1.4 Main filtering: Analysis post 1999
# Drop observations prior to 1999
main <- main %>%
filter(syear >= 1999)
cat("Dimensions:", dim(main), "\n")## Dimensions: 287810 49
We are examining the extent to which individuals’ subjective well-being can be predicted by objective factors that determine individual health decline and for which group this declaration bias is particularly large. And in that sense, which predictors (such as age, labor market status, personality traits and hand-grip strength) help improve predictions of individual health declines.
This report documents the steps taken to preprocess and analyze the SOEP data for health-related outcomes: subjective and objective health measures, predictors, and control variables. The transformations include cleaning, imputing missing values, and deriving new variables.
# Load required packages
my_packages <- c("tidyverse", "haven", "openxlsx", "labelled", "dplyr", "lubridate","ggplot2","skimr","ggcorrplot","patchwork","ggridges", "glmnet", "plm", "magrittr", "caret","waffle","randomForest","rpart","ranger","vip","pdp","rpart.plot")
for (p in my_packages) {
if (!require(p, character.only = TRUE)) {
install.packages(p, dependencies = TRUE)
}
library(p, character.only = TRUE)
}## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: haven
##
## Loading required package: openxlsx
##
## Loading required package: labelled
##
## Loading required package: skimr
##
## Loading required package: ggcorrplot
##
## Loading required package: patchwork
##
## Loading required package: ggridges
##
## Loading required package: glmnet
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loaded glmnet 4.1-8
##
## Loading required package: plm
##
##
## Attaching package: 'plm'
##
##
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
##
##
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: caret
##
## Loading required package: lattice
##
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
## Loading required package: waffle
##
## Loading required package: randomForest
##
## randomForest 4.7-1.2
##
## Type rfNews() to see new features/changes/bug fixes.
##
##
## Attaching package: 'randomForest'
##
##
## The following object is masked from 'package:dplyr':
##
## combine
##
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
##
## Loading required package: rpart
##
## Loading required package: ranger
##
##
## Attaching package: 'ranger'
##
##
## The following object is masked from 'package:randomForest':
##
## importance
##
##
## Loading required package: vip
##
##
## Attaching package: 'vip'
##
##
## The following object is masked from 'package:utils':
##
## vi
##
##
## Loading required package: pdp
##
##
## Attaching package: 'pdp'
##
##
## The following object is masked from 'package:purrr':
##
## partial
##
##
## Loading required package: rpart.plot
main <- read_dta('/Users/salmahoumane/Documents/R/pl.dta') %>%
select(
"pid", "syear", "ple0008", "plh0035", "ple0055", "ple0056", "ple0072", "ple0081_h", "ple0086_v4",
"ple0080_v2", "ple0086_v1", "ple0086_v4", paste0("ple00", 11:23),
"ple0004", "ple0005", "ple0095", "plh0182", "plh0171",
"plh0042", "plh0033", "plh0032", "ple0010_h", "ple0006", "ple0007"
)
cat("Main dataset dimensions:", dim(main), "\n")## Main dataset dimensions: 377869 35
# Merge with `pgen` dataset
data_pgen <- read_dta('/Users/salmahoumane/Documents/R/pgen.dta') %>%
select("pid", "syear", "pgerwzeit", "pgemplst", "pglabnet", "pgtatzeit", "pgexpue", "pgstib","pgfamstd", "pgisced11")
cat("PGEN dataset dimensions:", dim(data_pgen), "\n")## PGEN dataset dimensions: 381814 10
main_before_merge <- dim(main) # Store dimensions before merge
main <- merge(main, data_pgen, by = c("pid", "syear"), all.x = TRUE)
cat("After merging with pgen: Rows =", nrow(main), "Cols =", ncol(main), "\n")## After merging with pgen: Rows = 377869 Cols = 43
## Same number of rows then the main dataset with 7 new rows -> Merge successful
# Check for duplicate rows introduced during merge
duplicates <- main %>%
group_by(pid, syear) %>%
summarise(count = n()) %>%
filter(count > 1)## `summarise()` has grouped output by 'pid'. You can override using the `.groups`
## argument.
if (nrow(duplicates) > 0) {
cat("Warning: Duplicates detected after merging with pgen!\n")
print(duplicates)
} else {
cat("No duplicates found after merging with pgen.\n")
}## No duplicates found after merging with pgen.
# Merge with `ppathl` dataset
data_ppathl <- read_dta('/Users/salmahoumane/Documents/R/ppathl.dta') %>%
select("pid", "hid", "syear", "psample", "gebjahr", "sex", "migback", "germborn")
cat("PPATHL dataset dimensions:", dim(data_ppathl), "\n")## PPATHL dataset dimensions: 634864 8
main_before_ppathl <- dim(main) # Store dimensions before second merge
main <- left_join(main, data_ppathl, by = c("pid", "syear"))
cat("After merging with ppathl: Rows =", nrow(main), "Cols =", ncol(main), "\n")## After merging with ppathl: Rows = 377869 Cols = 49
## Same number of rows then the main dataset with 6 new rows -> Merge successful
## Final dataset dimensions after all merges: 377869 49
# Clean and rename the `health` variable
main <- main %>%
rename(health = ple0008) %>%
filter(health != -1) %>% # Filter valid health values -> 398 observations dropped
mutate(health = case_when(health == 5 ~ 1, # Reverse the scale: 1 -> 5, 5 -> 1
health == 4 ~ 2,
health == 3 ~3,
health == 2 ~ 4,
health == 1 ~ 5))
sum(is.na(main$health)) # no missings## [1] 0
# Visualisation: Bar plot
ggplot(main, aes(x = factor(health))) +
geom_bar(fill = "steelblue") +
labs(
title = "Distribution of Self-Assessed Health",
x = "Health Category (1 = Poor, 5 = Excellent)",
y = "Count"
) +
theme_minimal()# Create `worr_health_categorical` and `worr_health_dummy`
main <- main %>%
rename(worr_health_categorical = plh0035) %>%
filter(worr_health_categorical != -1 & worr_health_categorical!= -4) %>% # drop those who refused to answer -> 1287 obs
mutate(
worr_health_categorical = case_when(
worr_health_categorical < 0 ~ NA_real_,
TRUE ~ worr_health_categorical
)
) %>%
group_by(pid) %>%
mutate(
worr_health_categorical = if_else(
syear %in% c(2011, 2012, 2013) & is.na(worr_health_categorical), #impute the years 2011-2013 because 11000 observations werent asked this question in these years
floor(
mean(
worr_health_categorical[syear %in% c(2010, 2014)],
na.rm = TRUE
)
),
worr_health_categorical
)
) %>%
ungroup() %>%
drop_na(worr_health_categorical) %>% # 1032
mutate(worr_health_categorical = case_when(
worr_health_categorical == 3 ~ 1, # Reverse the scale: 1 = no worries
worr_health_categorical == 2 ~ 2,
worr_health_categorical == 1 ~ 3), # 3 = lots of worries
worr_health_dummy = if_else(
worr_health_categorical >= 2, 1, 0
)
)
# Visualisation: Bar plot (categorical)
ggplot(main, aes(x = worr_health_categorical)) +
geom_bar(fill = "coral") +
labs(
title = "Distribution of Worries About Health",
x = "Worries About Health",
y = "Count"
) +
theme_minimal()# Visualisation: Pie Char (dummy)
main %>%
group_by(worr_health_dummy) %>%
summarize(count = n()) %>%
mutate(
proportion = count / sum(count),
label = paste0(round(proportion * 100, 1), "%")
) %>%
ggplot(aes(x = "", y = proportion, fill = factor(worr_health_dummy))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(
title = "Proportion of Health Worry (Dummy)",
fill = "Health Worry (Dummy)"
) +
theme_minimal() +
geom_text(aes(label = label), position = position_stack(vjust = 0.5))# Create health decline dummy for 2 years into the future
main <- main %>%
arrange(pid, syear) %>% # Ensure data is sorted by pid and syear
group_by(pid) %>%
mutate(
health_in_2yrs = dplyr::lead(health, n = 2, order_by = syear), # Use dplyr::lead()
health_decline_2yrs = case_when(
!is.na(health_in_2yrs) & health_in_2yrs < health ~ 1, # Decline
!is.na(health_in_2yrs) & health_in_2yrs >= health ~ 0, # No decline
TRUE ~ NA_real_ # NA for missing data
)
) %>%
ungroup()
# Check for missing values in health_in_2yrs
sum(is.na(main$health_in_2yrs)) # 81.113## [1] 81113
#Visualisation: Line plot
# Line plot: Average health over years
main %>%
group_by(syear) %>%
summarize(avg_health = mean(health, na.rm = TRUE)) %>%
ggplot(aes(x = syear, y = avg_health)) +
geom_line(color = "darkgreen") +
geom_point(color = "darkgreen") +
labs(
title = "Average Self-Assessed Health Over Years",
x = "Survey Year",
y = "Average Health"
) +
theme_minimal()# Create life_satisfaction and health_satisfaction variables
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
plh0182_NA = sum(is.na(plh0182)),
plh0182_not_asked = sum(plh0182 %in% c(-8, -5), na.rm = TRUE),
plh0182_other_neg = sum(plh0182 < 0 & !(plh0182 %in% c(-8, -5)), na.rm = TRUE),
plh0171_NA = sum(is.na(plh0171)),
plh0171_not_asked = sum(plh0171 %in% c(-8, -5), na.rm = TRUE),
plh0171_other_neg = sum(plh0171 < 0 & !(plh0171 %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 6
## plh0182_NA plh0182_not_asked plh0182_other_neg plh0171_NA plh0171_not_asked
## <int> <int> <int> <int> <int>
## 1 0 1840 519 0 2862
## # ℹ 1 more variable: plh0171_other_neg <int>
# Step 2: Drop those who refused to answer or answered invalidly
main <- main %>%
filter(!(plh0182 %in% c(-1, -2, -3) | plh0171 %in% c(-1, -2, -3))) # 908 observations
# Step 3: Impute remaining Values for plh0182 and plh0171 with Max 2-Year Lag/Lead
main <- main %>%
arrange(pid, syear) %>% # Ensure proper sorting
group_by(pid) %>%
mutate(
# Flag rows that will be imputed
imputed_flag_life = if_else(plh0182 %in% c(-5, -8) | is.na(plh0182), 1, 0),
imputed_flag_health = if_else(plh0171 %in% c(-5, -8) | is.na(plh0171), 1, 0),
# Replace -5 and -8 with NA
plh0182 = case_when(plh0182 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0182),
plh0171 = case_when(plh0171 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0171),
# Impute plh0182 (life satisfaction)
life_satisfaction = if_else(
is.na(plh0182),
coalesce(
if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0182)), lag(plh0182), NA_real_),
if_else(dplyr::lead(syear) - syear <= 2 & !is.na(dplyr::lead(plh0182)), dplyr::lead(plh0182), NA_real_)
),
plh0182
),
# Impute plh0171 (health satisfaction)
health_satisfaction = if_else(
is.na(plh0171),
coalesce(
if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0171)), lag(plh0171), NA_real_),
if_else(dplyr::lead(syear) - syear <= 2 & !is.na(dplyr::lead(plh0171)), dplyr::lead(plh0171), NA_real_)
),
plh0171
)
) %>%
ungroup()
# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
summarise(
life_satisfaction_NA = sum(is.na(life_satisfaction)),
health_satisfaction_NA = sum(is.na(health_satisfaction))
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## life_satisfaction_NA health_satisfaction_NA
## <int> <int>
## 1 328 681
# Drop observations that are still missing
main <- main %>%
filter(!is.na(life_satisfaction) & !is.na(health_satisfaction)) # 681 observations dropped
# Step 4: Descriptive Statistics
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag_life == 1 | imputed_flag_health == 1) %>%
summarise(
Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
Count = n()
)
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag_life == 0 & imputed_flag_health == 0) %>%
summarise(
Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 5
## Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
## <dbl> <dbl> <dbl>
## 1 7.53 8 7.06
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 5
## Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
## <dbl> <dbl> <dbl>
## 1 7.21 8 6.79
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
# The imputed observations display slighly larger health & life-satisfaction
# Step 5: Visualizations
# All Rows: Life Satisfaction
ggplot(main, aes(x = factor(life_satisfaction))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Life Satisfaction - All Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 1), aes(x = factor(life_satisfaction))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Life Satisfaction - Imputed Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 0), aes(x = factor(life_satisfaction))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Life Satisfaction - Non-Imputed Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# All Rows: Health Satisfaction
ggplot(main, aes(x = factor(health_satisfaction))) +
geom_bar(fill = "coral", color = "black") +
labs(
title = "Health Satisfaction - All Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 1), aes(x = factor(health_satisfaction))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Health Satisfaction - Imputed Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 0), aes(x = factor(health_satisfaction))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Health Satisfaction - Non-Imputed Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Process height and weight
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
height_NA = sum(is.na(ple0006)),
height_not_asked = sum(ple0006 %in% c(-8, -5), na.rm = TRUE),
height_invalid = sum(ple0006 > 245 | (ple0006 < 0 & !(ple0006 %in% c(-8, -5))), na.rm = TRUE),
weight_NA = sum(is.na(ple0007)),
weight_not_asked = sum(ple0007 %in% c(-8, -5), na.rm = TRUE),
weight_invalid = sum(ple0007 < 0 & !(ple0007 %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing and invalid values before imputation:")## [1] "Breakdown of missing and invalid values before imputation:"
## # A tibble: 1 × 6
## height_NA height_not_asked height_invalid weight_NA weight_not_asked
## <int> <int> <int> <int> <int>
## 1 0 167036 400 0 167036
## # ℹ 1 more variable: weight_invalid <int>
# Step 2: Process and Impute Height with Lag/Lead (<= 2 years)
# Assumption for Imputation of height: does not change significantly for adults
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
# Track imputation flag for height
height_imputed = if_else(ple0006 > 245 | ple0006 <= 0 | is.na(ple0006), 1, 0),
# Set invalid or missing height values to missing
height = ifelse(ple0006 < 0 | ple0006 > 245, NA, ple0006),
# Forward fill missing values
height = zoo::na.locf(height, na.rm = FALSE),
# Backward fill missing values
height = zoo::na.locf(height, na.rm = FALSE, fromLast = TRUE)
) %>%
ungroup()
# Step 3: Process and Impute Weight with Lag/Lead (<= 2 years)
#imputation: The mean weight between the last observation before the missing and the first after. And for the ones that have only one before or one after, the one but only if it is an adjacent year.
main <- main %>%
filter(!(ple0007 %in% c(-1, -2, -3))) %>%
group_by(pid) %>%
arrange(syear, .by_group = TRUE) %>%
mutate(# Track imputation flag for height
weight_imputed = if_else(ple0007 <= 0 | is.na(ple0007), 1, 0),
ple0007 = case_when(ple0007<0 ~ NA_real_,
TRUE ~ ple0007),
# Create lag and lead weights
lag_weight = lag(ple0007),
lead_weight = dplyr::lead(ple0007),
lag_year = lag(syear),
lead_year = dplyr::lead(syear),
# Apply imputation rules
weight = case_when(
is.na(ple0007) & !is.na(lag_weight) & !is.na(lead_weight) &
(syear - lag_year == 1 & lead_year - syear == 1) ~ (lag_weight + lead_weight) / 2, # Mean of valid neighbors
is.na(ple0007) & !is.na(lag_weight) & (syear - lag_year == 1) ~ lag_weight, # Use previous value if adjacent
is.na(ple0007) & !is.na(lead_weight) & (lead_year - syear == 1) ~ lead_weight, # Use next value if adjacent
TRUE ~ ple0007 # Keep original if no condition is met
)
) %>%
select(-lag_weight, -lead_weight, -lag_year, -lead_year) %>% # Clean up temporary columns
ungroup()
# Step 4: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
summarise(
height_NA = sum(is.na(height)),
weight_NA = sum(is.na(weight)),
height_imputed = sum(height_imputed),
weight_imputed = sum(weight_imputed)
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 4
## height_NA weight_NA height_imputed weight_imputed
## <int> <int> <dbl> <dbl>
## 1 16533 64221 167132 167036
main <- main %>%
filter(!is.na(height) & !is.na(weight))
# Step 5: Calculate BMI and Create Categorical Variable
main <- main %>%
mutate(
BMI = as.numeric(weight / ((height / 100) ^ 2)), # Calculate BMI
bmi_categorical = case_when(
BMI < 18.5 ~ 1, #underweight
BMI >= 18.5 & BMI < 25 ~ 2, #normal
BMI >= 25 & BMI < 30 ~ 3, #overweight
BMI >= 30 ~ 4, # obese
TRUE ~ NA_real_
)
)
# Step 6: Descriptive Statistics for Imputed and Non-Imputed Rows
# Descriptive stats for all rows
all_stats <- main %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
# Descriptive stats for imputed rows
imputed_stats <- main %>%
filter(height_imputed == 1 | weight_imputed == 1) %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
# Descriptive stats for non-imputed rows
non_imputed_stats <- main %>%
filter(height_imputed == 0 & weight_imputed == 0) %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for All Rows:")## [1] "Descriptive Statistics for All Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 26.1 25.4 22.7 28.7 11.0 197. 217781
## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 26.2 25.5 22.8 28.7 11.5 180. 102850
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 26.1 25.4 22.7 28.6 11.0 197. 114931
# Step 7: Visualizations
# Histogram for all rows
ggplot(main, aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "purple", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - All Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()# Histogram for imputed rows
ggplot(main %>% filter(height_imputed == 1 | weight_imputed == 1), aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - Imputed Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()# Histogram for non-imputed rows
ggplot(main %>% filter(height_imputed == 0 & weight_imputed == 0), aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "green", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - Non-Imputed Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()Number of Cigarettes has too little observations
# Create `smoking_dummy`
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
ple0081_h_NA = sum(is.na(ple0081_h)),
ple0081_h_not_asked = sum(ple0081_h %in% c(-5, -8), na.rm = TRUE),
ple0081_h_invalid = sum(ple0081_h < 0 & !(ple0081_h %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing and invalid values before imputation:")## [1] "Breakdown of missing and invalid values before imputation:"
## # A tibble: 1 × 3
## ple0081_h_NA ple0081_h_not_asked ple0081_h_invalid
## <int> <int> <int>
## 1 0 104642 106
# Step 2: Drop those who refused to answer or answered invalidly
main <- main %>%
filter(!(ple0081_h %in% c(-1, -3) | ple0086_v4 %in% c(-1, -3))) # 137 observations
# Step 3.1: Impute Smoking Status (ple0081_h) with Lag and Lead (Max 2 Years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
# Flag missing smoking values
smoking_imputed = if_else(ple0081_h %in% c(-5, -8), 1, 0),
# Determine if the individual was ever observed to smoke
ever_smoked = any(ple0081_h == 1, na.rm = TRUE),
# Impute smoking values
ple0081_h = case_when(
# If missing and the person ever smoked, impute using lag/lead within 2 years
ple0081_h %in% c(-5, -8) & ever_smoked ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0081_h), NA_real_)
lead_val <- if_else(dplyr::lead(syear) - syear <= 2, dplyr::lead(ple0081_h), NA_real_)
if_else(
!is.na(lag_val) & !is.na(lead_val),
(lag_val + lead_val) / 2,
coalesce(lag_val, lead_val)
)
},
# If missing and the person never smoked, set to 2
ple0081_h %in% c(-5, -8) & !ever_smoked ~ 2,
# Keep observed positive smoking values
ple0081_h > 0 ~ ple0081_h,
# Set other invalid cases to NA
TRUE ~ NA_real_
),
# Create smoking dummy (1 = Smoker, 0 = Non-Smoker)
smoking_dummy = case_when(
ple0081_h <= 1.5 ~ 1,
ple0081_h > 1.5 ~ 0,
TRUE ~ NA_real_
)
) %>%
ungroup()
# Step 3: Remove Remaining Invalid or Negative Values
main <- main %>%
filter(!is.na(smoking_dummy))
# Step 4: Descriptive Statistics
# Smoking Dummy
smoking_stats_all <- main %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
smoking_stats_imputed <- main %>%
filter(smoking_imputed == 1) %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
smoking_stats_non_imputed <- main %>%
filter(smoking_imputed == 0) %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Smoking (All Rows):")## [1] "Descriptive Statistics for Smoking (All Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.306 217644
## [1] "Descriptive Statistics for Smoking (Imputed Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.347 104642
## [1] "Descriptive Statistics for Smoking (Non-Imputed Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.269 113002
# Step 6: Visualizations
# Smoking Dummy
ggplot(main, aes(x = factor(smoking_dummy))) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Smoking Status - All Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()ggplot(main %>% filter(smoking_imputed == 1), aes(x = factor(smoking_dummy))) +
geom_bar(fill = "orange", color = "black") +
labs(title = "Smoking Status - Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()ggplot(main %>% filter(smoking_imputed == 0), aes(x = factor(smoking_dummy))) +
geom_bar(fill = "green", color = "black") +
labs(title = "Smoking Status - Non-Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()Kick this out because it is way too little observations
# Create variables for worries about job security, financial situation, and economic situation
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
plh0042_refused = sum(plh0042 == -1, na.rm = TRUE),
plh0042_NA = sum(plh0042 %in% c(-5, -6), na.rm = TRUE),
plh0042_not_applicable = sum(plh0042 == -2, na.rm = TRUE),
plh0033_refused = sum(plh0033 == -1, na.rm = TRUE),
plh0033_invalid = sum(plh0033 == -2, na.rm = TRUE),
plh0032_NA = sum(plh0032 == -5, na.rm = TRUE),
plh0032_invalid = sum(plh0032 == -1, na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 7
## plh0042_refused plh0042_NA plh0042_not_applicable plh0033_refused
## <int> <int> <int> <int>
## 1 4352 3398 80367 340
## # ℹ 3 more variables: plh0033_invalid <int>, plh0032_NA <int>,
## # plh0032_invalid <int>
# Step 2: Lag/Lead Imputation (Max 2 Years)
main <- main %>%
filter(plh0033 != -1 & plh0042 != -1 & plh0032 != -1 & plh0033 != -2 & plh0032 != -2) %>%
arrange(pid, syear) %>% # Ensure data is ordered by pid and syear
group_by(pid) %>%
mutate(
# Impute -5 or -8 for `plh0042` with max 2-year lag/lead
worries_job_imputed = if_else(plh0042 %in% c(-5, -6), 1, 0),
plh0042 = case_when(
plh0042 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0042), NA_real_)
lead_val <- if_else(dplyr::lead(syear) - syear <= 2, dplyr::lead(plh0042), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
plh0042 == -2 ~ 3,
TRUE ~ plh0042
),
# not necessary for plh0033 because no remaining missings
# Repeat for `plh0032`
worries_economic_imputed = if_else(plh0032 == -5, 1, 0),
plh0032 = case_when(
plh0032 == -5 ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0032), NA_real_)
lead_val <- if_else(dplyr::lead(syear) - syear <= 2, dplyr::lead(plh0032), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
TRUE ~ plh0032
)
) %>%
ungroup()
# Step 3: Create Categorical and Dummy Variables
main <- main %>%
mutate(
worr_job_categorical = case_when(plh0042 == 1 ~ 3, # no worries
plh0042 == 3 ~ 1, # large worries
TRUE ~ plh0042),
worr_financial_categorical = case_when(plh0033 == 1 ~ 3, # no worries
plh0033 == 3 ~ 1, # large worries
TRUE ~ plh0033),
worr_economic_categorical = case_when(plh0032 == 1 ~ 3, # no worries
plh0032 == 3 ~ 1, # large worries
TRUE ~ plh0032),
worr_job_dummy = ifelse(worr_job_categorical >= 2, 1, 0),
worr_financial_dummy = ifelse(worr_financial_categorical >= 2, 1, 0),
worr_economic_dummy = ifelse(worr_economic_categorical >= 2, 1, 0)
)
main <- main %>%
filter(!is.na(worr_job_categorical) & !is.na(worr_financial_categorical)& !is.na(worr_economic_categorical)) # XXX observations dropped# Define descriptive_stats function
descriptive_stats <- function(data, condition) {
data %>%
filter(!!rlang::enquo(condition)) %>% # Use enquo for capturing condition
summarise(
Job_Worries_Mean = mean(as.numeric(worr_job_dummy), na.rm = TRUE),
Financial_Worries_Mean = mean(as.numeric(worr_financial_dummy), na.rm = TRUE),
Economic_Worries_Mean = mean(as.numeric(worr_economic_dummy), na.rm = TRUE),
Count = n()
)
}
# Calculate statistics for all rows, imputed rows, and non-imputed rows
all_stats <- descriptive_stats(main, TRUE)
imputed_stats <- descriptive_stats(main, worries_job_imputed == 1 | worries_economic_imputed == 1)
non_imputed_stats <- descriptive_stats(main, worries_job_imputed == 0 & worries_economic_imputed == 0)
print("Descriptive Statistics for All Rows:")## [1] "Descriptive Statistics for All Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 0.268 0.661 0.806 212710
## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 0.145 0.739 0.00500 5596
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 0.272 0.659 0.827 207114
# Step 5: Visualizations
# Drop rows with NAs in worry-related columns
main <- main %>%
drop_na(worr_job_categorical, worr_financial_categorical, worr_economic_categorical)
# Distribution of Worry Levels - All Rows
main %>%
select(worr_job_categorical, worr_financial_categorical, worr_economic_categorical) %>%
pivot_longer(cols = everything(), names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - All Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Imputed Rows
main %>%
filter(worries_job_imputed == 1 | worries_economic_imputed == 1) %>%
pivot_longer(cols = c(worr_job_categorical, worr_economic_categorical),
names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - Imputed Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Non-Imputed Rows
main %>%
filter(worries_job_imputed == 0 & worries_economic_imputed == 0) %>%
pivot_longer(cols = c(worr_job_categorical, worr_economic_categorical),
names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - Non-Imputed Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
refused = sum(pgerwzeit %in% c(-1, -3), na.rm = TRUE),
not_applicable = sum(pgerwzeit == -2, na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 2
## refused not_applicable
## <int> <int>
## 1 123 87344
# Step 2: Clean Tenure
main <- main %>%
filter(pgerwzeit != -1 & pgerwzeit != -3) %>%
mutate(tenure = case_when(pgerwzeit == -2 ~ 0,
TRUE ~ pgerwzeit))
# Step 3: Visualizations
# All Rows
ggplot(main, aes(x = tenure)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Tenure - All Rows",
x = "Tenure (Years)",
y = "Count"
) +
theme_minimal()# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
refused = sum(pglabnet %in% c(-1, -3), na.rm = TRUE),
not_applicable = sum(pglabnet == -2, na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 2
## refused not_applicable
## <int> <int>
## 1 0 86372
# Transform `pglabnet` into `net_income`
main <- main %>%
mutate(net_income = case_when(pglabnet == -2 ~ 0,
TRUE ~ pglabnet))
# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
refused = sum(pgtatzeit %in% c(-1, -3), na.rm = TRUE),
not_applicable = sum(pgtatzeit == -2, na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 2
## refused not_applicable
## <int> <int>
## 1 3837 86371
# Transform `pgtatzeit` into `work_time_weekly`
main <- main %>%
filter(pgtatzeit != -1 & pgtatzeit != -3) %>%
mutate(
work_time_weekly = case_when(pgtatzeit == -2 ~ 0,
TRUE ~ pgtatzeit))
# Summary Statistics for Net Income
# All rows
net_income_all <- main %>%
summarise(
Mean = mean(net_income, na.rm = TRUE),
Median = median(net_income, na.rm = TRUE),
Min = min(net_income, na.rm = TRUE),
Max = max(net_income, na.rm = TRUE),
Count = sum(!is.na(net_income))
)
# Summary Statistics for Work Time Weekly
# All rows
work_time_all <- main %>%
summarise(
Mean = mean(work_time_weekly, na.rm = TRUE),
Median = median(work_time_weekly, na.rm = TRUE),
Min = min(work_time_weekly, na.rm = TRUE),
Max = max(work_time_weekly, na.rm = TRUE),
Count = sum(!is.na(work_time_weekly))
)
# Display Results
print("Net Income Summary Statistics - All Rows:")## [1] "Net Income Summary Statistics - All Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1 1032. 526 0 124219 208750
## [1] "\nWork Time Weekly Summary Statistics - All Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1 21.9 20 0 80 208750
# Plot for Net Income - All Rows
ggplot(main, aes(x = net_income)) +
geom_histogram(binwidth = 500, fill = "skyblue", color = "black", alpha = 0.7) +
labs(
title = "Net Income Distribution - All Rows",
x = "Net Income",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Work Time Weekly - All Rows
ggplot(main, aes(x = work_time_weekly)) +
geom_histogram(binwidth = 5, fill = "lightcoral", color = "black", alpha = 0.7) +
labs(
title = "Work Time Weekly Distribution - All Rows",
x = "Work Time (Hours/Week)",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Transform total years unemployed (pgexpue)
# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
not_answered = sum(pgexpue == -1 , na.rm = TRUE)) # 1278
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 1
## not_answered
## <int>
## 1 1180
# Step 2: Clean Total Years Unemployment
main <- main %>%
rename(total_years_unemployment = pgexpue) %>%
filter(total_years_unemployment != -1)
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = total_years_unemployment)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Total Years Unemployed - All Rows",
x = "Total Years Unemployed",
y = "Count"
) +
theme_minimal()# Transform occupational status (pgstib)
main <- main %>%
filter(pgstib != -1) %>%
mutate(
occup_status = case_when(
pgstib >= 210 & pgstib <= 250 ~ "Blue Collar Employee", # Blue collar emp
pgstib >= 510 & pgstib <= 550 ~ "White Collar Employee", # White collar emp
pgstib >= 310 & pgstib <= 340 ~ "Agricultural Employee", # agriculture emp
pgstib >= 410 & pgstib <= 413 ~ "Agriculture - Self-Employed", # agriculture self-emp
pgstib >= 610 & pgstib <= 640 ~ "Public Servant", # Public servant
(pgstib >= 420 & pgstib <= 433) | pgstib == 560 ~ "White Collar - Self-Employed", # Self-employed
pgstib %in% c(12, -2, 10) ~ "Unemployed", # Unemployed
pgstib %in% c(11, 15) | (pgstib >= 110 & pgstib <= 150) ~ "Apprentice", # Apprentice
pgstib == 13 ~ "Retired", # Retired
TRUE ~ "Other" # Assign a default category for unmatched values
)
)
main <- main %>%
mutate(unemp_dummy = ifelse(occup_status == "Unemployed", 1, 0))
#Visualisation: Bar Plot
ggplot(main, aes(x = occup_status)) +
geom_bar(fill = "darkred", color = "black") +
labs(
title = "Distribution of Occupational Status",
x = "Occupational Status",
y = "Count"
) +
theme_minimal()# Transform sex into male dummy
main <- main %>%
filter(sex != -3) %>%
mutate(male = ifelse(sex == 1, 1, 0))
#Visualisation: Pie Chart
main %>%
group_by(male) %>%
summarize(count = n()) %>%
mutate(
proportion = count / sum(count),
label = paste0(round(proportion * 100, 1), "%")
) %>%
ggplot(aes(x = "", y = proportion, fill = factor(male, labels = c("Female", "Male")))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(
title = "Proportion of Male vs. Female",
fill = "Gender"
) +
theme_minimal() +
geom_text(aes(label = label), position = position_stack(vjust = 0.5))# Transform migration background and create dummy
main <- main %>%
mutate(germborn = ifelse(migback == 2, 0, 1),
migback=ifelse(migback >= 2, 1, 0))
#Visualisation: Bar plot
ggplot(main, aes(x = migback)) +
geom_bar(fill = "cyan", color = "black") +
labs(
title = "Distribution of Migration Background",
x = "Migration Background",
y = "Count"
) +
theme_minimal()## Warning: Unknown or uninitialised column: `pgfamstd_dummy`.
## [1] "Number of NA rows in pgfamstd_dummy before imputation: 0"
# depending on that make imputation
# Transform marital status and handle NA with lag/lead logic
main <- main %>%
filter(pgfamstd != -1 & pgfamstd != -3) %>%
mutate(rel_status = case_when(pgfamstd == 1 | pgfamstd == 7 ~ "married",
pgfamstd == 2 | pgfamstd == 8 ~ "separated",
pgfamstd == 3 ~ "single",
pgfamstd == 4 ~ "divorced",
pgfamstd == 5 ~ "widowed",
pgfamstd == 6 ~ "spouse abroad",
TRUE ~ NA_character_),
rel_status = factor(rel_status))
sum(is.na(main$rel_status)) ## [1] 2
# Drop the two missings because relationship status is too fluctuating
main <- main %>%
filter(!is.na(rel_status))
# Visualizations
# All Rows
ggplot(main, aes(x = rel_status)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Relationship Status - All Rows",
x = "Relationship Status",
y = "Count"
) +
theme_minimal()
## 4.9 Age
main <- main %>%
filter(gebjahr != -1) %>%
mutate(age = syear - gebjahr)
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = age)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Agge",
x = "Age",
y = "Count"
) +
theme_minimal()# Filter for age between 25 and 55
main <- main %>%
filter(age >= 25 & age <= 55)
ggplot(main, aes(x = age)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Agge",
x = "Age",
y = "Count"
) +
theme_minimal()## [1] 46684
main <- main %>%
mutate(educ = as.numeric(pgisced11),
educ = case_when(educ < 0 ~ NA_real_,
TRUE ~ educ))
main <- main %>%
mutate(educ_imputed = ifelse(is.na(educ), 1,0)) %>%
arrange(pid, syear) %>% # Ensure data is sorted by individual and time
group_by(pid) %>%
fill(educ, .direction = "down") %>% # Carry forward values
fill(educ, .direction = "up") %>% #also carry upward because all observations are older than 25
ungroup() %>%
filter(!is.na(educ))
# Labels
# [1] Primary education
# [2] Lower secondary education
# [3] Upper secondary education
# [4] Post-secondary non-tertiary educati
# [5] Short-cycle tertiary education
# [6] Bachelor s or equivalent level
# [7] Master s or equivalent level
# [8] Doctoral or equivalent level
# All Rows
ggplot(main, aes(x = educ)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Education Level - All Rows",
x = "Education Level",
y = "Count"
) +
theme_minimal()# Imputed Rows only
main %>%
filter(educ_imputed == 1) %>%
ggplot(aes(x = educ)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Education Level - Imputed Rows",
x = "Education Level",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows only
main %>%
filter(educ_imputed == 0) %>%
ggplot(aes(x = educ)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Education Level - Not Imputed Rows",
x = "Education Level",
y = "Count"
) +
theme_minimal()## [1] "pid" "syear"
## [3] "health" "worr_health_categorical"
## [5] "ple0055" "ple0056"
## [7] "ple0072" "ple0081_h"
## [9] "ple0086_v4" "ple0080_v2"
## [11] "ple0086_v1" "ple0011"
## [13] "ple0012" "ple0013"
## [15] "ple0014" "ple0015"
## [17] "ple0016" "ple0017"
## [19] "ple0018" "ple0019"
## [21] "ple0020" "ple0021"
## [23] "ple0022" "ple0023"
## [25] "ple0004" "ple0005"
## [27] "ple0095" "plh0182"
## [29] "plh0171" "plh0042"
## [31] "plh0033" "plh0032"
## [33] "ple0010_h" "ple0006"
## [35] "ple0007" "pgerwzeit"
## [37] "pgemplst" "pglabnet"
## [39] "pgtatzeit" "total_years_unemployment"
## [41] "pgstib" "pgfamstd"
## [43] "pgisced11" "hid"
## [45] "psample" "gebjahr"
## [47] "sex" "migback"
## [49] "germborn" "worr_health_dummy"
## [51] "health_in_2yrs" "health_decline_2yrs"
## [53] "imputed_flag_life" "imputed_flag_health"
## [55] "life_satisfaction" "health_satisfaction"
## [57] "height_imputed" "height"
## [59] "weight_imputed" "weight"
## [61] "BMI" "bmi_categorical"
## [63] "smoking_imputed" "ever_smoked"
## [65] "smoking_dummy" "worries_job_imputed"
## [67] "worries_economic_imputed" "worr_job_categorical"
## [69] "worr_financial_categorical" "worr_economic_categorical"
## [71] "worr_job_dummy" "worr_financial_dummy"
## [73] "worr_economic_dummy" "tenure"
## [75] "net_income" "work_time_weekly"
## [77] "occup_status" "unemp_dummy"
## [79] "male" "rel_status"
## [81] "age" "educ"
## [83] "educ_imputed"
# Select only the relevant cleaned variables
main <- main %>%
select(
pid, syear, gebjahr, migback, germborn, health, worr_health_categorical,
total_years_unemployment, worr_health_dummy, health_in_2yrs, health_decline_2yrs,
life_satisfaction, health_satisfaction, height, weight, BMI, bmi_categorical,
smoking_dummy, ever_smoked, worr_job_categorical, worr_financial_categorical, worr_economic_categorical, worr_job_dummy, worr_financial_dummy,worr_economic_dummy,
tenure, net_income, work_time_weekly, occup_status, unemp_dummy, male, rel_status,
age, educ
)
# Count missing values for each column
missing_counts <- colSums(is.na(main))
# View the result
missing_counts## pid syear
## 0 0
## gebjahr migback
## 0 0
## germborn health
## 0 0
## worr_health_categorical total_years_unemployment
## 0 0
## worr_health_dummy health_in_2yrs
## 0 18017
## health_decline_2yrs life_satisfaction
## 18017 0
## health_satisfaction height
## 0 0
## weight BMI
## 0 0
## bmi_categorical smoking_dummy
## 0 0
## ever_smoked worr_job_categorical
## 0 0
## worr_financial_categorical worr_economic_categorical
## 0 0
## worr_job_dummy worr_financial_dummy
## 0 0
## worr_economic_dummy tenure
## 0 0
## net_income work_time_weekly
## 0 0
## occup_status unemp_dummy
## 0 0
## male rel_status
## 0 0
## age educ
## 0 0
## Warning in Ops.factor(left, right): '<' not meaningful for factors
## pid syear
## 0 0
## gebjahr migback
## 0 0
## germborn health
## 0 0
## worr_health_categorical total_years_unemployment
## 0 0
## worr_health_dummy health_in_2yrs
## 0 0
## health_decline_2yrs life_satisfaction
## 0 0
## health_satisfaction height
## 0 0
## weight BMI
## 0 0
## bmi_categorical smoking_dummy
## 0 0
## ever_smoked worr_job_categorical
## 0 2321
## worr_financial_categorical worr_economic_categorical
## 0 4002
## worr_job_dummy worr_financial_dummy
## 0 0
## worr_economic_dummy tenure
## 0 0
## net_income work_time_weekly
## 0 0
## occup_status unemp_dummy
## 0 0
## male rel_status
## 0 0
## age educ
## 0 0
main <- main %>%
filter(!is.na(health_decline_2yrs) & !is.na(health_in_2yrs) & worr_job_categorical > 0 & worr_economic_categorical > 0)
#Display the summary of the dataset
skim(main)| Name | main |
| Number of rows | 74326 |
| Number of columns | 34 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 1 |
| logical | 1 |
| numeric | 31 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| occup_status | 0 | 1 | 5 | 28 | 0 | 9 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| rel_status | 0 | 1 | FALSE | 6 | mar: 46899, sin: 17549, div: 6969, sep: 2166 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| ever_smoked | 0 | 1 | 0.42 | FAL: 42994, TRU: 31332 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| pid | 0 | 1 | 10929626.47 | 11544005.60 | 5201.00 | 2532901.00 | 5028152.00 | 21143401.00 | 38681902.00 | ▇▁▂▁▁ |
| syear | 0 | 1 | 2011.23 | 5.23 | 2001.00 | 2007.00 | 2012.00 | 2016.00 | 2019.00 | ▅▅▅▇▇ |
| gebjahr | 0 | 1 | 1970.23 | 8.83 | 1955.00 | 1963.00 | 1969.00 | 1977.00 | 1994.00 | ▅▇▆▃▁ |
| migback | 0 | 1 | 0.19 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| germborn | 0 | 1 | 0.85 | 0.35 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| health | 0 | 1 | 3.56 | 0.89 | 1.00 | 3.00 | 4.00 | 4.00 | 5.00 | ▁▂▅▇▂ |
| worr_health_categorical | 0 | 1 | 1.79 | 0.67 | 1.00 | 1.00 | 2.00 | 2.00 | 3.00 | ▆▁▇▁▂ |
| total_years_unemployment | 0 | 1 | 1.00 | 2.52 | 0.00 | 0.00 | 0.00 | 0.75 | 37.00 | ▇▁▁▁▁ |
| worr_health_dummy | 0 | 1 | 0.65 | 0.48 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| health_in_2yrs | 0 | 1 | 3.51 | 0.89 | 1.00 | 3.00 | 4.00 | 4.00 | 5.00 | ▁▂▅▇▂ |
| health_decline_2yrs | 0 | 1 | 0.25 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| life_satisfaction | 0 | 1 | 7.23 | 1.68 | 0.00 | 7.00 | 8.00 | 8.00 | 10.00 | ▁▁▂▇▃ |
| health_satisfaction | 0 | 1 | 6.99 | 2.05 | 0.00 | 6.00 | 7.00 | 8.00 | 10.00 | ▁▂▃▇▃ |
| height | 0 | 1 | 172.71 | 9.48 | 82.00 | 165.00 | 172.00 | 180.00 | 210.00 | ▁▁▁▇▁ |
| weight | 0 | 1 | 77.97 | 17.63 | 34.00 | 65.00 | 76.00 | 88.00 | 230.00 | ▇▇▁▁▁ |
| BMI | 0 | 1 | 26.04 | 5.14 | 12.03 | 22.58 | 25.24 | 28.41 | 197.23 | ▇▁▁▁▁ |
| bmi_categorical | 0 | 1 | 2.68 | 0.78 | 1.00 | 2.00 | 3.00 | 3.00 | 4.00 | ▁▇▁▆▃ |
| smoking_dummy | 0 | 1 | 0.37 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| worr_job_categorical | 0 | 1 | 1.49 | 0.66 | 1.00 | 1.00 | 1.00 | 2.00 | 3.00 | ▇▁▃▁▁ |
| worr_financial_categorical | 0 | 1 | 1.92 | 0.69 | 1.00 | 1.00 | 2.00 | 2.00 | 3.00 | ▅▁▇▁▃ |
| worr_economic_categorical | 0 | 1 | 2.08 | 0.65 | 1.00 | 2.00 | 2.00 | 2.00 | 3.00 | ▂▁▇▁▃ |
| worr_job_dummy | 0 | 1 | 0.39 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| worr_financial_dummy | 0 | 1 | 0.72 | 0.45 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| worr_economic_dummy | 0 | 1 | 0.83 | 0.38 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| tenure | 0 | 1 | 7.93 | 8.60 | 0.00 | 0.67 | 4.75 | 13.00 | 40.92 | ▇▂▂▁▁ |
| net_income | 0 | 1 | 1475.72 | 1495.12 | 0.00 | 450.00 | 1300.00 | 2045.00 | 124219.00 | ▇▁▁▁▁ |
| work_time_weekly | 0 | 1 | 30.97 | 18.56 | 0.00 | 17.00 | 39.00 | 43.00 | 80.00 | ▅▂▇▂▁ |
| unemp_dummy | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| male | 0 | 1 | 0.46 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| age | 0 | 1 | 41.00 | 8.14 | 25.00 | 35.00 | 42.00 | 48.00 | 55.00 | ▅▆▇▇▆ |
| educ | 0 | 1 | 4.05 | 1.60 | 0.00 | 3.00 | 3.00 | 6.00 | 8.00 | ▁▇▂▃▂ |
# In order to identify key predictors and relationships between variables, we create a correlation heatmap between numerical variables
# Select numeric columns
numeric_data <- main[, sapply(main, is.numeric)]
# Calculate correlation matrix
corr_matrix <- cor(numeric_data, use = "complete.obs")
# Plot correlation heatmap
ggcorrplot(corr_matrix, method = "circle", lab = TRUE) +
labs(title = "Correlation Heatmap") +
theme_minimal()# Set correlations below 0.5 or above -0.5 to NA
filtered_corr <- corr_matrix
filtered_corr[abs(filtered_corr) < 0.5] <- NA
# Define pairs to exclude
exclude_pairs <- list(
c("worr_job_dummy", "worr_job_categorical"),
c("worr_job_categorical", "worr_job_dummy"),
c("worr_economic_dummy", "worr_economic_categorical"),
c("worr_economic_categorical", "worr_economic_dummy"),
c("worr_health_dummy", "worr_health_categorical"),
c("worr_health_categorical", "worr_health_dummy"),
c("bmi_categorical", "BMI"),
c("BMI", "bmi_categorical"),
c("BMI", "weight"),
c("weight", "BMI"),
c("germborn", "migback"),
c("migback", "germborn"),
c("worr_financial_dummy", "worr_financial_categorical"),
c("worr_financial_categorical", "worr_financial_dummy"),
c("age", "gebjahr"),
c("gebjahr", "age"),
c("bmi_categorical", "weight"),
c("weight", "bmi_categorical"),
c("worr_economic_dummy", "worr_economic_categorical"),
c("health_satisfaction", "health"),
c("work_time_weekly", "net_income"),
c("syear", "pid"),
c("pid", "syear"),
c("weight", "height"),
c("worr_health_categorical", "health"),
c("work_time_weekly", "unemp_dummy"),
c("life_satisfaction", "health_satisfaction"),
c("worr_health_categorical", "health_satisfaction"),
c("health_in_2yrs", "health"),
c("male", "height"),
c("health_in_2yrs", "health_satisfaction")
)
filtered_table <- as.data.frame(as.table(filtered_corr)) %>%
filter(
!is.na(Freq), # Exclude NA correlations
Var1 != Var2, # Exclude diagonal elements
!(paste(Var1, Var2, sep = "_") %in% sapply(exclude_pairs, function(x) paste(x[1], x[2], sep = "_"))) # Exclude specific pairs
) %>%
arrange(desc(abs(Freq))) # Sort by absolute correlation
# View the filtered table
filtered_table # Display the table# Create a bar plot
# Rename variables for prettier and clearer names
filtered_table <- filtered_table %>%
mutate(
Variable_Pairs = case_when(
Var1 == "health" & Var2 == "health_satisfaction" ~ "Health & Health Satisfaction",
Var1 == "height" & Var2 == "male" ~ "Height & Gender (Male)",
Var1 == "net_income" & Var2 == "work_time_weekly" ~ "Net Income & Work Time Weekly",
Var1 == "health" & Var2 == "health_in_2yrs" ~ "Health & Health in 2 years",
Var1 == "height" & Var2 == "weight" ~ "Height & Weight",
Var1 == "health_satisfaction" & Var2 == "health_in_2yrs" ~ "Health Satisfaction & Health in 2 years",
Var1 == "health_satisfaction" & Var2 == "life_satisfaction" ~ "Health Satisfaction & Life Satisfaction",
Var1 == "health_satisfaction" & Var2 == "worr_health_categorical" ~ "Health Satisfaction & Worrying about Health (Categorical)",
Var1 == "health" & Var2 == "worr_health_categorical" ~ "Health & Worrying about Health (Categorical)",
Var1 == "unemp_dummy" & Var2 == "work_time_weekly" ~ "Unemployment (Dummy) & Work Time Weekly",
TRUE ~ paste(Var1, Var2, sep = " & ") # Default to original pairs if not specified
)
)
# Create the bar plot
ggplot(filtered_table, aes(x = reorder(Variable_Pairs, Freq), y = Freq, fill = Freq > 0)) +
geom_bar(stat = "identity", color = "black") +
coord_flip() + # Flip coordinates for better readability
labs(
title = "Correlation Plot",
subtitle = "Visualizing Correlations Across Variable Pairs"
) +
scale_fill_manual(
values = c("#FF0000", "#163E64"), # Red for negative, dark blue for positive
guide = "none" # Remove the legend
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank(), # Remove y-axis title
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis text
axis.text.y = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis text
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)# In order to highlight gender disparities in health perceptions, we create these graphs per gender
# Group data by pid and gender (male), and summarize
main_unique <- main %>%
group_by(pid, male) %>%
summarize(
health = mean(health, na.rm = TRUE),
health_decline_2yrs = mean(health_decline_2yrs, na.rm = TRUE),
worr_health_dummy = mean(worr_health_dummy, na.rm = TRUE),
health_satisfaction = mean(health_satisfaction, na.rm = TRUE),
.groups = "drop"
)
# Health per Gender: Violin Plot
ggplot(main_unique, aes(x = factor(male), y = health, fill = factor(male))) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") + # Add transparency to the violins
stat_summary(fun = "mean", geom = "point", shape = 16, size = 3, color = "black") + # Add mean points
geom_text(stat = "summary", fun = "mean", aes(label = round(after_stat(y), 2)), vjust = -0.5) + # Add mean labels
scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) + # Update x-axis labels
scale_fill_manual(values = c("#FF0000", "#163E64")) + # Red for Female, Dark Blue for Male
labs(
title = "Health Distribution by Gender",
subtitle = "Violin Plot Showing Health Scores",
x = "Gender",
y = "Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_decline_2yrs per Gender: Histogram
ggplot(main_unique, aes(x = factor(male), y = health_decline_2yrs, fill = factor(male))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and define outliers
scale_fill_manual(values = c("#FF0000", "#163E64"), labels = c("Female", "Male"), name = "Gender") + # Custom fill colors
scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) + # Update x-axis labels
labs(
title = "Health Decline in 2 Years by Gender",
subtitle = "Boxplot Comparison of Health Decline Across Genders",
x = "Gender",
y = "Health Decline"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none"
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per Gender: Histogram
ggplot(main_unique, aes(x = factor(male), y = worr_health_dummy, fill = factor(male))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and define outliers
scale_fill_manual(values = c("#FF0000", "#163E64"), labels = c("Female", "Male"), name = "Gender") + # Custom fill colors
scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) + # Update x-axis labels
labs(
title = "Health Worries by Gender",
subtitle = "Boxplot of Health Worries for Females and Males",
x = "Gender",
y = "Health Worries"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "None"
)## Saving 7 x 5 in image
# Graph for health_satisfaction per Gender: Density plot
ggplot(main_unique, aes(x = health_satisfaction, fill = factor(male), color = factor(male))) +
geom_density(alpha = 0.5, size = 1) + # Add transparency and adjust line thickness
scale_fill_manual(values = c("#FF0000", "#163E64"), labels = c("Female", "Male"), name = "Gender") + # Custom fill colors
scale_color_manual(values = c("#FF0000", "#163E64"), labels = c("Female", "Male"), name = "Gender") + # Custom line colors
labs(
title = "Density of Health Satisfaction by Gender",
subtitle = "Density Plot Comparing Female and Male Responses",
x = "Health Satisfaction",
y = "Density",
fill = "Gender",
color = "Gender"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none"
)## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Saving 7 x 5 in image
We notice that gender differences are insignificant on health variables. We, therefore, do not expect gender to be a strong predictor.
# Group data by pid and educ, and summarize
main_educ <- main %>%
group_by(pid, educ) %>%
summarize(
health = mean(health, na.rm = TRUE),
health_decline_2yrs = mean(health_decline_2yrs, na.rm = TRUE),
worr_health_dummy = mean(worr_health_dummy, na.rm = TRUE),
health_satisfaction = mean(health_satisfaction, na.rm = TRUE),
.groups = "drop"
)
# Count of individuals per Educ
ggplot(main_educ %>% filter(educ != 0), aes(x = factor(educ), fill = factor(educ))) +
geom_bar(alpha = 0.7, color = "black") +
scale_x_discrete(
labels = c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
),
limits = c("1", "2", "3", "4", "5", "6", "7", "8") # Exclude 0 explicitly
) +
scale_fill_brewer(palette = "RdBu") +
labs(
title = "Distribution of Individuals by Educational Level",
x = "Educational Level",
y = "Count",
fill = "Educational Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none"
)## Saving 7 x 5 in image
# Health per Educ:
ggplot(main_educ %>% filter(educ != 0), aes(x = health, fill = factor(educ))) +
geom_density(alpha = 0.7, color = "black") +
facet_wrap(~ factor(educ), ncol = 2, labeller = as_labeller(c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
))) +
scale_fill_brewer(palette = "RdBu", guide = "none") + # Use diverging red/blue palette
labs(
title = "Density of Health by Educational Level",
x = "Health",
y = "Density",
fill = "Educational Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_blank(), # Dark blue y-axis labels
# Facet title styling
strip.text = element_text(size = 12, face = "bold", color = "#1E2B4F", margin = ggplot2::margin(b = 5)), # Style facet labels
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
)## Saving 7 x 5 in image
# Graph for health_decline_2yrs per educ:
ggplot(main_educ %>% filter(educ != 0), aes(x = factor(educ), y = health_decline_2yrs, fill = factor(educ))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black border
scale_x_discrete(
labels = c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
)
) +
scale_fill_brewer(palette = "RdBu", guide = "none") + # Use Blues palette
labs(
title = "Health Decline in 2 Years by Educational Level",
subtitle = "Boxplot of Health Decline Across Education Levels",
x = "Educational Level",
y = "Health Decline",
fill = "Educational Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per educ:
ggplot(main_educ %>% filter(educ != 0), aes(x = factor(educ), y = worr_health_dummy, fill = factor(educ))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black borders
scale_x_discrete(
labels = c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
)
) +
scale_fill_brewer(palette = "RdBu", guide = "none") + # Use Blues palette
labs(
title = "Health Worries by Educational Level",
subtitle = "Boxplot of Health Worries Across Education Levels",
x = "Educational Level",
y = "Health Worries",
fill = "Educational Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_satisfaction per educ:
ggplot(main_educ %>% filter(educ != 0), aes(x = factor(educ), y = health_satisfaction, fill = factor(educ))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black border
scale_x_discrete(
labels = c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
)
) +
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Health Satisfaction by Educational Level",
subtitle = "Boxplot of Health Satisfaction Across Education Levels",
x = "Educational Level",
y = "Health Satisfaction",
fill = "Educational Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
Our dataset mostly contains individuals with upper secondary education levels, with only a few primary school level and doctoral level. We expect those 2 groups to have a high variance.
In all graphs, we notice the same results: positive health perception as well as health satisfaction increases with education, and negative health perception through worrying or health decline decreases with education. We expect education to be correlated with income: people with higher education levels get higher salaries so the effects might be linked.
There are 3 individuals with income over 40k, we will be dropping them
# Aggregate data to calculate mean net income for each pid
cleaned_main <- main %>%
group_by(pid) %>%
summarize(net_income = mean(net_income, na.rm = TRUE), .groups = "drop") %>%
filter(net_income <= 40000) # Remove rows where mean income > 40000
# Distribution of income variable
ggplot(cleaned_main, aes(x = net_income)) +
geom_histogram(binwidth = 500, fill = "#163E64", color = "black", alpha = 0.7) + # Use dark blue fill and black borders
labs(
title = "Histogram of Net Income Distribution",
subtitle = "Distribution of Net Income Across Individuals",
x = "Net Income",
y = "Count"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space on the right and top/bottom
)## Saving 7 x 5 in image
# Group data by pid and educ, and summarize
income_data <- main %>%
filter(net_income < 40000) %>%
group_by(pid, net_income) %>%
summarize(
health = mean(health, na.rm = TRUE),
health_decline_2yrs = mean(health_decline_2yrs, na.rm = TRUE),
worr_health_dummy = mean(worr_health_dummy, na.rm = TRUE),
health_satisfaction = mean(health_satisfaction, na.rm = TRUE),
.groups = "drop"
)
# Health per Income:
ggplot(income_data, aes(x = net_income, y = health)) +
geom_smooth(
aes(group = 1), # Ensure a single LOESS line for all bins
method = "loess",
color = "#F00000", # Red for LOESS line
size = 1.2,
se = FALSE, # No confidence interval shading
span = 1
) +
stat_summary_bin(fun = "mean", bins = 30, geom = "point", color = "#163E64", size = 2) +
labs(
title = "Health vs. Income (Binned Scatter Plot)",
x = "Net Income",
y = "Average Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space on the right and top/bottom
)## `geom_smooth()` using formula = 'y ~ x'
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
# Graph for health_decline_2yrs per income:
income_data <- income_data %>%
mutate(net_income_bin = cut(net_income, breaks = 10)) # Create income bins
# Reformat income bins into human-readable labels
income_data <- income_data %>%
mutate(
net_income_bin = cut(
net_income,
breaks = 10,
labels = c(
"< 2.5K",
"2.5K - 5K",
"5K - 7.5K",
"7.5K - 10K",
"10K - 12.5K",
"12.5K - 15K",
"15K - 17.5K",
"17.5K - 20K",
"20K - 22.5K",
"> 22.5K"
)
)
)
ggplot(income_data, aes(x = net_income_bin, y = health_decline_2yrs, fill = net_income_bin)) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black border
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Health Decline by Income (Binned)",
x = "Net Income (Binned)",
y = "Health Decline"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per income:
ggplot(income_data, aes(x = worr_health_dummy, fill = net_income_bin)) +
geom_density(alpha = 0.5, color = "black", size = 0.5) + # Add transparency and black outline
scale_fill_brewer(palette = "RdBu", name = "Net Income (Binned)") +
labs(
title = "Density of Health Worries by Income",
x = "Health Worries",
y = "Density"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "right", # Move legend to the top
legend.text = element_text(size = 10, color = "#1E2B4F"), # Dark blue legend text
legend.title = element_text(size = 12, face = "bold", color = "#1E2B4F") # Bold and dark blue legend title
)## Saving 7 x 5 in image
# Group data by income bins and calculate average health satisfaction
heatmap_data <- income_data %>%
mutate(net_income_bin = cut(
net_income,
breaks = 10,
labels = c(
"< 2.5K",
"2.5K - 5K",
"5K - 7.5K",
"7.5K - 10K",
"10K - 12.5K",
"12.5K - 15K",
"15K - 17.5K",
"17.5K - 20K",
"20K - 22.5K",
"> 22.5K"
)
)) %>%
group_by(net_income_bin) %>%
summarize(avg_health_satisfaction = mean(health_satisfaction, na.rm = TRUE), .groups = "drop")
# Create the heatmap
ggplot(heatmap_data, aes(x = net_income_bin, y = 1, fill = avg_health_satisfaction)) +
geom_tile(color = "white") + # Add white borders for separation
scale_fill_gradient2(
low = "#67001F", # Deep Red
mid = "#F7F7F7", # Neutral White
high = "#053061", # Deep Blue
midpoint = mean(heatmap_data$avg_health_satisfaction, na.rm = TRUE) # Center the gradient around the mean
) +
labs(
title = "Average Health Satisfaction by Income Group",
x = "Net Income (Binned)",
y = "Health Satisfaction",
fill = ""
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_blank(), # Remove y-axis labels (since it’s a single-row heatmap)
axis.ticks.y = element_blank(), # Remove y-axis ticks
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"),
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 70, b = 20, l = 50), # Add more space around the plot
# Legend styling
legend.position = "right", # Move legend to the top
)## Saving 7 x 5 in image
Most of the participants in this dataset have a net income lower than 5000 euros and we see that for those, the higher the income, the higher the average health as we have a positively sloped line. However, for people with higher than 5000 euros income, we see a high variance, with some outliers therefore the negative trend cannot be confirmed.
ggplot(main, aes(x = factor(educ), y = net_income, fill = factor(educ))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") +
scale_x_discrete(
labels = c(
"1" = "Primary",
"2" = "Lower Secondary",
"3" = "Upper Secondary",
"4" = "Post-Secondary",
"5" = "Short-Cycle Tertiary",
"6" = "Bachelor's",
"7" = "Master's",
"8" = "Doctoral"
)
) +
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Net Income by Education Level",
x = "Education Level",
y = "Net Income",
fill = "Education Level"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space on the right and top/bottom
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
There is not a significant relationship between education level and net income.
# Distribution of age variable
ggplot(main, aes(x = age)) +
geom_histogram(aes(y = ..density..), binwidth = 5, fill = "#A6CEE3", color = "black", alpha = 0.7) +
geom_density(color = "#1E2B4F", size = 1) +
labs(
title = "Combined Histogram and Density Plot of Age Distribution",
x = "Age",
y = "Density"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space around the plot
)## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Saving 7 x 5 in image
# Health per age:
# Filter and create age groups
age_data <- main %>%
filter(!is.na(age)) %>% # Remove NA values in the age column
mutate(
age_group = cut(
age,
breaks = c(25, 30, 35, 40, 45, 50, 55), # Define the breaks explicitly
include.lowest = TRUE, # Ensures 25 is included in the first group
right = TRUE, # Ensures 55 is included in the last group
labels = c("[25-30)", "[30-35)", "[35-40)", "[40-45)", "[45-50)", "[50-55]") # Labels with brackets
)
)
ggplot(age_data, aes(x = age_group, fill = age_group)) +
geom_bar(alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "Blues", guide = "none") + # Use a blue color palette
labs(
title = "Age Group Distribution",
x = "Age Group",
y = "Count"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Create violin plot for health by 5-year age groups
ggplot(age_data, aes(x = age_group, y = health, fill = age_group)) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") + # Add transparency and black border
scale_fill_brewer(palette = "Blues", guide = "none") + # Use a blue color palette
labs(
title = "Distribution of Health by 5-Year Age Groups",
x = "Age Group",
y = "Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_decline_2yrs per age:
ggplot(age_data, aes(x = age_group, fill = factor(health_decline_2yrs))) +
geom_bar(position = "fill", alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"), # Light blue and dark blue
labels = c("No Decline", "Decline"),
name = "Health Decline"
) +
labs(
title = "Proportion of Health Decline by Age Group",
subtitle = "Stacked Bar Chart Showing Health Decline Across Age Groups",
x = "Age Group",
y = "Proportion",
fill = "Health Decline"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "top", # Move legend to the top
legend.text = element_text(size = 12, color = "#1E2B4F"), # Dark blue legend text
legend.title = element_text(size = 14, face = "bold", color = "#1E2B4F") # Bold and dark blue legend title
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per age:
ggplot(age_data, aes(x = age_group, fill = factor(worr_health_dummy))) +
geom_bar(position = "fill", alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"), # Light blue and dark blue
labels = c("No Worries", "Worries"),
name = "Health Worry"
) +
labs(
title = "Proportion of Health Worries by Age Group",
x = "Age Group",
y = "Proportion",
fill = "Health Worry"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "top", # Move legend to the top
legend.text = element_text(size = 12, color = "#1E2B4F"), # Dark blue legend text
legend.title = element_text(size = 14, face = "bold", color = "#1E2B4F") # Bold and dark blue legend title
)## Saving 7 x 5 in image
# Graph for health_satisfaction per age:
ggplot(age_data, aes(x = age_group, y = health_satisfaction, fill = age_group)) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black border
scale_fill_brewer(palette = "Blues", guide = "none") + # Use a blue color palette
labs(
title = "Health Satisfaction by Age Group",
x = "Age Group",
y = "Health Satisfaction"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
In our dataset, the most participants we have are around the age of 45. For all health variables, we notice the same trend: the positive subjective health perceptions are higher within younger individuals, health worries increases with age, and health satisfaction decreases with age. We also notice that health decline is stable accross all age groups at around 24%. Because we expect older individuals to have a higher income, we should look into the intersection of age and income and their impact on health.
ggplot(main, aes(x = age, y = net_income)) +
geom_point(alpha = 0.6, color = "#1E2B4F") + # Semi-transparent points with teal color
geom_smooth(method = "loess", color = "#F00000", se = TRUE, size = 1.2) + # Add a red LOESS trend line
labs(
title = "Net Income vs. Age",
x = "Age",
y = "Net Income"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space around the plot
)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! workspace required (8287256140) is too large probably because of setting 'se = TRUE'.
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! workspace required (8287256140) is too large probably because of setting 'se = TRUE'.
# Group data by pid and rel_status, and summarize
main_rel <- main %>%
group_by(pid, rel_status) %>%
summarize(
health = mean(health, na.rm = TRUE),
health_decline_2yrs = mean(health_decline_2yrs, na.rm = TRUE),
worr_health_dummy = mean(worr_health_dummy, na.rm = TRUE),
health_satisfaction = mean(health_satisfaction, na.rm = TRUE),
.groups = "drop"
)
# Distribution of rel_status variable
ggplot(main_rel, aes(x = factor(rel_status), fill = factor(rel_status))) +
geom_bar(alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Distribution of Relationship Status",
x = "Relationship Status",
y = "Count"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Health per rel_status:
ggplot(main_rel, aes(x = factor(rel_status), y = health, fill = factor(rel_status))) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Average Health by Relationship Status",
x = "Relationship Status",
y = "Average Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_decline_2yrs per rel_status:
ggplot(main_rel, aes(x = factor(rel_status), y = health_decline_2yrs, fill = factor(rel_status))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black border
scale_fill_brewer(palette = "RdBu", guide = "none") +
labs(
title = "Health Decline by Relationship Status",
x = "Relationship Status",
y = "Health Decline"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per rel_status:
ggplot(main_rel, aes(x = factor(rel_status), y = worr_health_dummy, fill = factor(rel_status))) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette ="RdBu", guide = "none") +
labs(
title = "Average Health Worries by Relationship Status",
x = "Relationship Status",
y = "Average Health Worries"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 50, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_satisfaction per rel_status:
ggplot(main_rel, aes(x = factor(rel_status), y = health_satisfaction, fill = factor(rel_status))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "Set2", guide = "none") + # Use the Set2 palette for soft colors
labs(
title = "Health Satisfaction by Relationship Status",
x = "Relationship Status",
y = "Health Satisfaction"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
Our dataset contains mostly married individuals, which makes sense as our main population is around the age of 45. Average health is quite consistent across all relationship statuses, as well as health decline and health satisfaction. It is different for separated individuals and those with spouses abroad but the low number of these observations causing high variance does not allow us to make conclusions.
We notice that single individuals are the least worried about their health but we expect this to be caused by the younger age of single participants. We will therefore examine the relationship between marital status and age.
ggplot(main, aes(x = factor(rel_status), y = age, fill = factor(rel_status))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") +
scale_x_discrete(
labels = c(
"1" = "Single",
"2" = "Married",
"3" = "Divorced",
"4" = "Widowed"
)
) +
scale_fill_brewer(palette = "RdBu", guide = "none") + # Use the RdBu palette
labs(
title = "Age by Relationship Status",
x = "Relationship Status",
y = "Age"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
As expected, the single group is younger but this is not significant as the variance is quite high.
# Group data by pid and germborn, and summarize
main_germborn <- main %>%
group_by(pid, germborn) %>%
summarize(
health = mean(health, na.rm = TRUE),
health_decline_2yrs = mean(health_decline_2yrs, na.rm = TRUE),
worr_health_dummy = mean(worr_health_dummy, na.rm = TRUE),
health_satisfaction = mean(health_satisfaction, na.rm = TRUE),
.groups = "drop"
)
# Distribution of germborn variable
ggplot(main_germborn, aes(x = factor(germborn), fill = factor(germborn))) +
geom_bar(alpha = 0.7, color = "black") + # Add transparency and black borders
scale_x_discrete(
labels = c(
"0" = "Born outside Germany",
"1" = "Born in Germany"
)
) +
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"),
guide = "none" # Remove legend
) +
labs(
title = "Distribution of German-Born Individuals",
x = "Migration Status",
y = "Count"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space around the plot
)## Saving 7 x 5 in image
# Health per germborn:
ggplot(main_germborn, aes(x = factor(germborn), y = health, fill = factor(germborn))) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"), # Light blue and dark blue
labels = c("Born Outside of Germany", "Born in Germany"),
name = "German-Born Status"
) +
scale_x_discrete(
labels = c(
"0" = "Born Outside of Germany",
"1" = "Born in Germany"
)
) +
labs(
title = "Distribution of Health by Migration Status",
x = "",
y = "Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for health_decline_2yrs per germborn:
ggplot(main_germborn, aes(x = factor(germborn), y = health_decline_2yrs, fill = factor(germborn))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black borders
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"), # Light blue and dark blue
labels = c("Born Outside of Germany", "Born in Germany"),
name = "Migration Status"
) +
scale_x_discrete(
labels = c(
"0" = "Born Outside of Germany",
"1" = "Born in Germany"
)
) +
labs(
title = "Health Decline by Migration Status",
x = "",
y = "Health Decline"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F", hjust = 0.5), # Centered and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# Graph for worr_health_dummy per germborn:
ggplot(main_germborn, aes(x = worr_health_dummy, fill = factor(germborn))) +
geom_density(alpha = 0.5, color = "black") + # Add transparency and black outline for density curves
scale_fill_manual(
values = c("#F00000", "#1E2B4F"),
labels = c("Born Outside of Germany", "Born in Germany"),
name = "Migration Status"
) +
labs(
title = "Density of Health Worries by Migration Status",
x = "Health Worries",
y = "Density"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "top", # Place legend at the top
legend.text = element_text(size = 12, color = "#1E2B4F"), # Dark blue legend text
legend.title = element_text(size = 14, face = "bold", color = "#1E2B4F") # Bold and dark blue legend title
)## Saving 7 x 5 in image
# Graph for health_satisfaction per germborn:
ggplot(main_germborn, aes(x = factor(germborn), y = health_satisfaction, fill = factor(germborn))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black borders
scale_fill_manual(
values = c("#A6CEE3", "#1E2B4F"), # Light blue and dark blue
labels = c("Born Outside of Germany", "Born in Germany"),
name = "Migration Status"
) +
scale_x_discrete(
labels = c(
"0" = "Born Outside of Germany",
"1" = "Born in Germany"
)
) +
labs(
title = "Health Satisfaction by Migration Status",
x = "",
y = "Health Satisfaction"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F", hjust = 0.5), # Bold and centered dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
Our dataset mostly contains individuals born in Germany, which makes sense as it is the SOEP data. For people born in Germany or Outside of Germany, health distributions are quite similar. We do notice that individuals born outside of Germany have smaller health declines but also higher health worries and lower health satisfaction. The small differences, the contradictory findings, merged with the fact that we only have a small sample of individuals born outside of Germany does not allow us to make conclusion.
# worr_health_categorical per health_satisfaction:
ggplot(main, aes(x = factor(worr_health_categorical), y = health_satisfaction, fill = factor(worr_health_categorical))) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "RdBu", guide = "none") + # Use the Blues palette for a gradient
scale_x_discrete(
labels = c(
"1" = "Does Not Worry",
"2" = "Worries a Little",
"3" = "Worries a Lot"
)
) +
labs(
title = "Health Satisfaction by Health Worry Categories",
x = "Health Worry Category",
y = "Health Satisfaction"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
# health_satisfaction per life_satisfaction:
ggplot(main, aes(x = life_satisfaction, y = health_satisfaction)) +
geom_bin2d(bins = 30) +
scale_fill_gradient(
low = "#A6CEE3", # Light blue
high = "#1E2B4F", # Dark blue
name = "Count" # Legend title
) +
labs(
title = "Health Satisfaction vs. Life Satisfaction",
x = "Life Satisfaction",
y = "Health Satisfaction"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "right", # Position legend on the right
legend.text = element_text(size = 12, color = "#1E2B4F"), # Dark blue legend text
legend.title = element_text(size = 14, face = "bold", color = "#1E2B4F") # Bold and dark blue legend title
)## Saving 7 x 5 in image
# health per BMI graph:
ggplot(main, aes(x = factor(bmi_categorical), y = health, fill = factor(bmi_categorical))) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") + # Add transparency and black borders
scale_fill_brewer(palette = "Blues", guide = "none") + # Use the Blues palette
scale_x_discrete(
labels = c(
"1" = "Underweight",
"2" = "Normal Weight",
"3" = "Overweight",
"4" = "Obese"
)
) +
labs(
title = "Health Distribution by BMI Category",
x = "BMI Category",
y = "Health"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, face = "bold", color = "#1E2B4F"), # Tilted and bold dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Legend styling
legend.position = "none" # Remove legend
)## Saving 7 x 5 in image
Further analyses to confirm our expectations, we see that the higher the health satisfaction, the lower the individuals worry about their health and that goes both ways.
Similarly, the higher life satisfaction, the higher health satisfaction as we observe a linear increasing relationship between the two.
Analyzing BMI, we also see that normal and overweight individuals show higher health. However, due to the low amount of observations of underweight and obese individuals, we cannot make conclusions.
# Define health group labels and titles
main <- main %>%
mutate(
health_group = case_when(
health >= 1 & health <= 2 ~ "1-2",
health > 2 & health <= 3 ~ "2-3",
health > 3 & health <= 5 ~ "4-5",
TRUE ~ NA_character_
)
)
health_titles <- c(
"1-2" = "Poor health",
"2-3" = "Moderate health",
"4-5" = "Good health"
)
# Function to calculate summary statistics for a variable
calculate_group_summary <- function(data, var_name, var_label) {
data %>%
summarise(
Mean = mean(!!sym(var_name), na.rm = TRUE),
SD = sd(!!sym(var_name), na.rm = TRUE),
Min = min(!!sym(var_name), na.rm = TRUE),
Max = max(!!sym(var_name), na.rm = TRUE),
Count = n()
) %>%
mutate(Variable = var_label)
}
# Variables to calculate statistics for
variables <- list(
list(name = "BMI", label = "BMI"),
list(name = "worr_health_dummy", label = "Worries About Health (Dummy)"),
list(name = "smoking_dummy", label = "Smoking (Dummy)"),
list(name = "health_in_2yrs", label = "Health in 2 Years"),
list(name = "health_decline_2yrs", label = "Expected Health Decline in 2 Years"),
list(name = "life_satisfaction", label = "Life Satisfaction"),
list(name = "health_satisfaction", label = "Health Satisfaction"),
list(name = "tenure", label = "Tenure"),
list(name = "net_income", label = "Net Income"),
list(name = "work_time_weekly", label = "Work Time Weekly"),
list(name = "total_years_unemployment", label = "Total Years Unemployment"),
list(name = "worr_job_dummy", label = "Job Worries (Dummy)"),
list(name = "worr_financial_dummy", label = "Financial Worries (Dummy)"),
list(name = "worr_economic_dummy", label = "Economic Worries (Dummy)"),
list(name = "male", label = "Gender (Male = 1)"),
list(name = "age", label = "Age"),
list(name = "educ", label = "Education Level")
)
# Initialize a list to store tables for each group
group_tables <- list()
# Generate separate tables for each health group
for (group in unique(main$health_group)) {
group_data <- main %>% filter(health_group == group)
group_table <- data.frame(
Variable = character(),
Mean = numeric(),
SD = numeric(),
Min = numeric(),
Max = numeric(),
Count = numeric(),
stringsAsFactors = FALSE
)
for (var in variables) {
stats <- calculate_group_summary(group_data, var$name, var$label)
if (!is.null(stats)) {
group_table <- bind_rows(group_table, stats)
}
}
# Round all numeric columns to 1 digit after the decimal point
group_table <- group_table %>%
mutate(
Mean = round(Mean, 1),
SD = round(SD, 1),
Min = round(Min, 1),
Max = round(Max, 1)
)
# Add the table to the list with the title as the key
table_title <- health_titles[group]
group_tables[[table_title]] <- group_table
}## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
## Warning: `..1$Min` and `..2$Min` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Min`.
## ✖ Values: -7 and -3
## Warning: `..1$Max` and `..2$Max` have conflicting value labels.
## ℹ Labels for these values will be taken from `..1$Max`.
## ✖ Values: -7 and -3
# Display each group's table with its title
for (title in names(group_tables)) {
print(paste("Summary statistics for:", title))
print(group_tables[[title]])
}## [1] "Summary statistics for: Poor health"
## Variable Mean SD Min Max Count
## 1 BMI 27.9 6.7 12 163.6 9006
## 2 Worries About Health (Dummy) 1.0 0.2 0 1.0 9006
## 3 Smoking (Dummy) 0.5 0.5 0 1.0 9006
## 4 Health in 2 Years 2.6 0.9 1 5.0 9006
## 5 Expected Health Decline in 2 Years 0.1 0.2 0 1.0 9006
## 6 Life Satisfaction 5.7 2.1 0 10.0 9006
## 7 Health Satisfaction 3.8 1.9 0 10.0 9006
## 8 Tenure 6.9 8.9 0 40.9 9006
## 9 Net Income 1069.2 1187.2 0 15000.0 9006
## 10 Work Time Weekly 24.8 20.5 0 80.0 9006
## 11 Total Years Unemployment 2.1 3.9 0 29.0 9006
## 12 Job Worries (Dummy) 0.4 0.5 0 1.0 9006
## 13 Financial Worries (Dummy) 0.9 0.4 0 1.0 9006
## 14 Economic Worries (Dummy) 0.9 0.4 0 1.0 9006
## 15 Gender (Male = 1) 0.4 0.5 0 1.0 9006
## 16 Age 43.4 7.8 25 55.0 9006
## 17 Education Level 3.6 1.5 1 8.0 9006
## [1] "Summary statistics for: Moderate health"
## Variable Mean SD Min Max Count
## 1 BMI 26.8 5.3 12.9 128.5 21384
## 2 Worries About Health (Dummy) 0.9 0.4 0.0 1.0 21384
## 3 Smoking (Dummy) 0.4 0.5 0.0 1.0 21384
## 4 Health in 2 Years 3.2 0.8 1.0 5.0 21384
## 5 Expected Health Decline in 2 Years 0.2 0.4 0.0 1.0 21384
## 6 Life Satisfaction 6.8 1.6 0.0 10.0 21384
## 7 Health Satisfaction 6.1 1.5 0.0 10.0 21384
## 8 Tenure 8.6 9.0 0.0 40.1 21384
## 9 Net Income 1425.6 1566.4 0.0 124219.0 21384
## 10 Work Time Weekly 31.2 18.5 0.0 80.0 21384
## 11 Total Years Unemployment 1.2 2.7 0.0 37.0 21384
## 12 Job Worries (Dummy) 0.5 0.5 0.0 1.0 21384
## 13 Financial Worries (Dummy) 0.8 0.4 0.0 1.0 21384
## 14 Economic Worries (Dummy) 0.9 0.4 0.0 1.0 21384
## 15 Gender (Male = 1) 0.5 0.5 0.0 1.0 21384
## 16 Age 42.3 7.9 25.0 55.0 21384
## 17 Education Level 3.9 1.5 1.0 8.0 21384
## [1] "Summary statistics for: Good health"
## Variable Mean SD Min Max Count
## 1 BMI 25.3 4.5 12.6 197.2 43936
## 2 Worries About Health (Dummy) 0.5 0.5 0.0 1.0 43936
## 3 Smoking (Dummy) 0.3 0.5 0.0 1.0 43936
## 4 Health in 2 Years 3.9 0.7 1.0 5.0 43936
## 5 Expected Health Decline in 2 Years 0.3 0.5 0.0 1.0 43936
## 6 Life Satisfaction 7.8 1.3 0.0 10.0 43936
## 7 Health Satisfaction 8.1 1.3 0.0 10.0 43936
## 8 Tenure 7.8 8.3 0.0 40.0 43936
## 9 Net Income 1583.5 1500.6 0.0 99999.0 43936
## 10 Work Time Weekly 32.1 17.9 0.0 80.0 43936
## 11 Total Years Unemployment 0.7 1.9 0.0 27.6 43936
## 12 Job Worries (Dummy) 0.4 0.5 0.0 1.0 43936
## 13 Financial Worries (Dummy) 0.7 0.5 0.0 1.0 43936
## 14 Economic Worries (Dummy) 0.8 0.4 0.0 1.0 43936
## 15 Gender (Male = 1) 0.5 0.5 0.0 1.0 43936
## 16 Age 39.9 8.1 25.0 55.0 43936
## 17 Education Level 4.2 1.6 0.0 8.0 43936
# Define health group labels and titles
main <- main %>%
mutate(
health_group = case_when(
health >= 1 & health <= 2 ~ "1-2",
health > 2 & health <= 3 ~ "2-3",
health > 3 & health <= 5 ~ "4-5",
TRUE ~ NA_character_
)
)
health_titles <- c(
"1-2" = "Poor health",
"2-3" = "Moderate health",
"4-5" = "Good health"
)
# Variables to calculate statistics for
variables <- list(
list(name = "BMI", label = "BMI"),
list(name = "worr_health_dummy", label = "Worries About Health (Dummy)"),
list(name = "smoking_dummy", label = "Smoking (Dummy)"),
list(name = "health_in_2yrs", label = "Health in 2 Years"),
list(name = "health_decline_2yrs", label = "Expected Health Decline in 2 Years"),
list(name = "life_satisfaction", label = "Life Satisfaction"),
list(name = "health_satisfaction", label = "Health Satisfaction"),
list(name = "tenure", label = "Tenure"),
list(name = "net_income", label = "Net Income"),
list(name = "work_time_weekly", label = "Work Time Weekly"),
list(name = "total_years_unemployment", label = "Total Years Unemployment"),
list(name = "employment", label = "Employment Status (Employed = 1)"),
list(name = "worr_job_dummy", label = "Job Worries (Dummy)"),
list(name = "worr_financial_dummy", label = "Financial Worries (Dummy)"),
list(name = "worr_economic_dummy", label = "Economic Worries (Dummy)"),
list(name = "male", label = "Gender (Male = 1)"),
list(name = "age", label = "Age"),
list(name = "educ", label = "Education Level"),
list(name = "university", label = "University Degree (Dummy)")
)
# Create a summary table
summary_table <- data.frame(Variable = sapply(variables, function(x) x$label))
# Calculate means for each health category
for (group in unique(main$health_group)) {
group_data <- main %>% filter(health_group == group)
group_means <- sapply(variables, function(var) mean(group_data[[var$name]], na.rm = TRUE))
summary_table[[health_titles[group]]] <- round(group_means, 1)
}## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
## Warning in mean.default(group_data[[var$name]], na.rm = TRUE): argument is not
## numeric or logical: returning NA
# Add row for the number of observations in each health category
num_observations <- sapply(unique(main$health_group), function(group) nrow(main %>% filter(health_group == group)))
summary_table <- rbind(summary_table, c("Number of Observations", num_observations))
# Display the summary table
print(summary_table)## Variable Poor health Moderate health Good health
## 1 BMI 27.9 26.8 25.3
## 2 Worries About Health (Dummy) 1 0.9 0.5
## 3 Smoking (Dummy) 0.5 0.4 0.3
## 4 Health in 2 Years 2.6 3.2 3.9
## 5 Expected Health Decline in 2 Years 0.1 0.2 0.3
## 6 Life Satisfaction 5.7 6.8 7.8
## 7 Health Satisfaction 3.8 6.1 8.1
## 8 Tenure 6.9 8.6 7.8
## 9 Net Income 1069.2 1425.6 1583.5
## 10 Work Time Weekly 24.8 31.2 32.1
## 11 Total Years Unemployment 2.1 1.2 0.7
## 12 Employment Status (Employed = 1) <NA> <NA> <NA>
## 13 Job Worries (Dummy) 0.4 0.5 0.4
## 14 Financial Worries (Dummy) 0.9 0.8 0.7
## 15 Economic Worries (Dummy) 0.9 0.9 0.8
## 16 Gender (Male = 1) 0.4 0.5 0.5
## 17 Age 43.4 42.3 39.9
## 18 Education Level 3.6 3.9 4.2
## 19 University Degree (Dummy) <NA> <NA> <NA>
## 20 Number of Observations 9006 21384 43936
# Save the summary table as a CSV file
write.csv(summary_table, "summary_table_short.csv", row.names = FALSE)The table provides descriptive statistics for individuals categorized by their self-reported health status: Good health, Moderate health, and Poor health. Each variable is summarized for these three health categories, allowing us to analyze how different characteristics vary across health statuses. - BMI Individuals with poorer health tend to have higher BMI (Good: 25.3 → Poor: 27.8), suggesting a link between obesity and poorer health. - Worries About Health (Dummy) Health worries increase with poorer health (Good: 0.5 → Poor: 1.0), as expected. - Individuals in poorer health have significantly lower expectations of good health in the future (Good: 3.9 → Poor: 2.6), also as expected. - Similarly, individuals in Poor health report much lower health satisfaction (Good: 8.1 → Poor: 3.8) and a decline in life satisfaction as health worsens (Good: 7.8 → Poor: 5.7). - Higher unemployment duration is associated with poorer health (Good: 0.7 years → Poor: 2.1 years). - Worries about finances and the economy increase slightly as health worsens. - Individuals in Good health have significantly higher incomes (Good: €1574 → Poor: €1060). - Poor health correlates with increasing age (Good: 39.8 → Poor: 43.3). - Education level decreases with poorer health (Good: 4.2 → Poor: 3.6). Lower education might contribute to worse health outcomes. - Surprisingly, those in poorer health report less expected decline (Good: 0.3 → Poor: 0.1). This could indicate resignation or lower health awareness.
But, in general, in our dataset, ,ost individuals report Good health (45,372), with fewer in Moderate health (21,859) and the least in Poor health (9,276). This indicates a skewed distribution towards better health.
# Convert data to panel format
panel_data <- pdata.frame(main, index = c("pid", "syear")) # pid = individual ID, syear = time
# Prepare data for LASSO regression
lasso_data <- panel_data %>%
dplyr::select(
pid, syear, health_decline_2yrs, health_in_2yrs, worr_health_dummy, health, BMI, smoking_dummy, life_satisfaction, health_satisfaction,
tenure, net_income, work_time_weekly, total_years_unemployment,
worr_job_dummy, worr_financial_dummy, worr_economic_dummy,
male, age, educ
) %>%
na.omit() # Remove missing values
# Check class distribution in y
class_distribution <- table(lasso_data$health_decline_2yrs)
print(class_distribution)##
## 0 1
## 55840 18486
# Split the data into training and test data
set.seed(123) # For reproducibility
trainIndex <- createDataPartition (lasso_data$health_in_2yrs, p = 0.7, list = FALSE)
# Convert trainIndex to a vector
trainIndex <- as.vector(trainIndex)
# Use the vector to split the data
trainData <- lasso_data[trainIndex, ]
testData <- lasso_data[-trainIndex, ]
dim(trainData)## [1] 52030 20
#Standardize variables
# Define relevant numerical variables as a character vector
numerical_vars <- c(
"BMI", "smoking_dummy", "life_satisfaction", "health_satisfaction",
"tenure", "net_income", "work_time_weekly", "total_years_unemployment",
"worr_job_dummy", "worr_financial_dummy", "worr_economic_dummy",
"male", "age", "educ", "worr_health_dummy", "health"
)
# Apply pre-processing (center and scale) only to the selected numerical columns
preProcValues <- preProcess(trainData[, numerical_vars], method = c("center", "scale"))
# Standardize the training data
trainData[, numerical_vars] <- predict(preProcValues, trainData[, numerical_vars])
# Standardize the test data
testData[, numerical_vars] <- predict(preProcValues, testData[, numerical_vars])
# Calculate the number of points in train and test subsets
train_size <- nrow(trainData)
test_size <- nrow(testData)
# Create a data frame with the counts for waffle plot
data_proportion <- c(`Train Data` = train_size, `Test Data` = test_size)
# Create the waffle plot
waffle(data_proportion / sum(data_proportion) * 100,
rows = 10, # Adjust number of rows for granularity
title = "Proportion of Data Points in Training and Test Sets",
colors = c("#1E2B4F", "#F00000"))set.seed(12345)
# Set up the training control
# Exclude pid and syear to avoid overfitting
trainData <- trainData %>% select(-pid, -syear, -health_decline_2yrs)
testData <- testData %>% select(-pid, -syear, -health_decline_2yrs)
# Define cross-validation as the method and 10 folds
train_control <- trainControl(method = "cv", number = 10)
# Train the LASSO model
lasso_model <- train(
# use 'shealth_decline_2yrs' as the dependent and all others as independent variables
health_in_2yrs ~ .,
# specify the traininf data
data = trainData,
# We want to use the LASSO method (glmnet)
method = "glmnet",
# Use the predefined training specification
trControl = train_control,
# Define what values for lambda we should try (tuning)
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.5, by = 0.001))
)## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## alpha lambda
## 1 1 0.001
# Visualize the cross-validation results
ggplot(lasso_model) +
labs(
title = "LASSO Model Cross-Validation Results",
subtitle = "RMSE vs. Log(Lambda) Across Cross-Validation Folds",
x = "Log(Lambda)", # X-axis represents lambda values (log-scaled)
y = "RMSE" # Y-axis represents cross-validated RMSE
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis labels
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis labels
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)),
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 20, b = 20, l = 20) # Add spacing around the plot
)#Save Graph
ggsave("lasso_plot.png", plot = last_plot(), width = 16, height = 8, dpi = 300)
# Model Evaluation
# Predict on the test set
predictions <- predict(lasso_model, newdata = testData)
# Calculate RMSE and R-squared
rmse <- RMSE(predictions, testData$health_in_2yrs)
r2 <- R2(predictions, testData$health_in_2yrs)
# Print the evaluation metrics
print(paste0("RMSE: ", rmse))## [1] "RMSE: 0.719622144223455"
## [1] "R-squared: 0.357889936906547"
# Extract the non-zero coefficients of the best model
coefficients_regression <- coef(lasso_model$finalModel, s = lasso_model$bestTune$lambda)
non_zero_coefficients_regression <- coefficients_regression[coefficients_regression != 0]
print(non_zero_coefficients_regression)## [1] 2.653092e+00 -5.637167e-02 2.665504e-01 -6.411979e-02 -3.097652e-02
## [6] 4.338482e-02 7.188054e-02 1.359287e-03 1.034705e-05 8.269025e-04
## [11] -8.890505e-03 4.263171e-03 -1.836508e-02 6.551570e-03 -7.648840e-02
## [16] 3.774840e-02
#Plotting coefficients
# Convert sparse matrix to data frame and filter non-zero coefficients
non_zero_coefficients <- data.frame(
Variable = rownames(as.matrix(coefficients_regression)),
Coefficient = as.numeric(as.matrix(coefficients_regression))
) %>%
filter(Coefficient != 0) # Keep only non-zero coefficients
# Remove the intercept and rename variables
non_zero_coefficients <- non_zero_coefficients %>%
filter(Variable != "(Intercept)") %>% # Remove the intercept
mutate(
Variable = case_when( # Rename variables for cleaner labels
Variable == "health" ~ "Health",
Variable == "health_satisfaction" ~ "Health Satisfaction",
Variable == "life_satisfaction" ~ "Life Satisfaction",
Variable == "educ" ~ "Education",
Variable == "male" ~ "Gender (Male)",
Variable == "worr_job_dummy" ~ "Job Worries",
Variable == "tenure" ~ "Tenure",
Variable == "work_time_weekly" ~ "Weekly Work Hours",
Variable == "net_income" ~ "Net Income",
Variable == "total_years_unemployment" ~ "Total Years Unemployed",
Variable == "worr_financial_dummy" ~ "Financial Worries",
Variable == "smoking_dummy" ~ "Smoking",
Variable == "worr_health_dummy" ~ "Health Worries",
Variable == "BMI" ~ "BMI",
Variable == "age" ~ "Age",
TRUE ~ Variable # Keep any variable not explicitly renamed
)
)
# Plotting non-zero coefficients with cleaner labels
ggplot(non_zero_coefficients, aes(x = reorder(Variable, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity", color = "black") +
coord_flip() + # Flip coordinates for better readability
labs(
title = "Non-Zero Coefficients from Lasso Regression",
x = "",
y = "Coefficient Value"
) +
scale_fill_manual(
values = c("#FF0000", "#163E64"), # Red for negative, dark blue for positive
guide = "none" # Remove the legend
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank(), # Remove y-axis title
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis text
axis.text.y = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis text
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space on the right and top/bottom
)#Save Graph
ggsave("lasso_coefficients.png", plot = last_plot(), width = 16, height = 8, dpi = 300)
# Visualize the lasso model
plot(lasso_model$finalModel, xvar = "lambda", label = TRUE)# Split the data into training and test data
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(lasso_data$health_decline_2yrs, p = 0.7, list = FALSE)
# Convert trainIndex to a vector
trainIndex <- as.vector(trainIndex)
# Use the vector to split the data
trainData <- lasso_data[trainIndex, ]
testData <- lasso_data[-trainIndex, ]
# Standardize numerical variables
# Define relevant numerical variables as a character vector
numerical_vars <- c(
"BMI", "smoking_dummy", "life_satisfaction", "health_satisfaction",
"tenure", "net_income", "work_time_weekly", "total_years_unemployment",
"worr_job_dummy", "worr_financial_dummy", "worr_economic_dummy",
"male", "age", "educ", "worr_health_dummy", "health"
)
# Apply pre-processing (center and scale) only to the selected numerical columns
preProcValues <- preProcess(trainData[, numerical_vars], method = c("center", "scale"))
# Standardize the training data
trainData[, numerical_vars] <- predict(preProcValues, trainData[, numerical_vars])
# Standardize the test data
testData[, numerical_vars] <- predict(preProcValues, testData[, numerical_vars])
# Exclude pid and syear to avoid overfitting
trainData <- trainData %>% select(-pid, -syear, -health_in_2yrs)
testData <- testData %>% select(-pid, -syear, health_in_2yrs)
# Convert health_decline_2yrs to a factor
trainData$health_decline_2yrs <- factor(trainData$health_decline_2yrs, levels = c(0, 1), labels = c("No", "Yes"))
testData$health_decline_2yrs <- factor(testData$health_decline_2yrs, levels = c(0, 1), labels = c("No", "Yes"))
# Set up the training control
train_control <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary)
# Train the LASSO model for classification
set.seed(12345)
lasso_model <- train(
health_decline_2yrs ~ ., # Dependent variable
data = trainData,
method = "glmnet", # Specify LASSO
family = "binomial", # Classification for binary outcome
trControl = train_control, # Use predefined training control
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.5, by = 0.001)), # LASSO tuning grid
metric = "ROC" # Use ROC as the evaluation metric
)
# Display the best lambda value and model summary
print(lasso_model$bestTune)## alpha lambda
## 1 1 0.001
# Extract the cross-validation results
cv_results <- lasso_model$results
# Create a ggplot visualization
ggplot(cv_results, aes(x = log(lambda), y = ROC, color = ROC)) +
geom_point(size = 2, alpha = 0.8) + # Add points for each lambda value
geom_line(size = 1) + # Add a line to connect points
labs(
title = "LASSO Model Cross-Validation Results",
subtitle = "ROC vs. Log(Lambda) Across Cross-Validation Folds",
x = "Log(Lambda)",
y = "ROC"
) +
scale_color_gradient(low = "#A6CEE3", high = "#1E2B4F") + # Gradient from light blue to dark blue
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis text
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis text
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title and subtitle styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
plot.subtitle = element_text(size = 14, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold subtitle
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20), # Add more space around the plot
# Remove legend
legend.position = "none"
)#Save Graph
ggsave("lasso_model_classification.png", plot = last_plot(), width = 16, height = 8, dpi = 300)
# Model Evaluation
# Predict probabilities on the test set
predictions <- predict(lasso_model, newdata = testData, type = "prob")
# Predict binary outcomes (threshold = 0.5)
predicted_classes <- ifelse(predictions[, "Yes"] > 0.5, "Yes", "No")
# Confusion matrix
conf_matrix <- confusionMatrix(factor(predicted_classes, levels = c("No", "Yes")),
factor(testData$health_decline_2yrs, levels = c("No", "Yes")))
print(conf_matrix)## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 16020 4240
## Yes 758 1279
##
## Accuracy : 0.7758
## 95% CI : (0.7703, 0.7813)
## No Information Rate : 0.7525
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2367
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9548
## Specificity : 0.2317
## Pos Pred Value : 0.7907
## Neg Pred Value : 0.6279
## Prevalence : 0.7525
## Detection Rate : 0.7185
## Detection Prevalence : 0.9086
## Balanced Accuracy : 0.5933
##
## 'Positive' Class : No
##
## [1] "Accuracy: 0.775844283984393"
## [1] "Sensitivity: 0.954821790439862"
## [1] "Specificity: 0.23174488131908"
# Convert sparse matrix to data frame and filter non-zero coefficients
coefficients <- coef(lasso_model$finalModel, s = lasso_model$bestTune$lambda)
non_zero_coefficients <- data.frame(
Variable = rownames(as.matrix(coefficients)),
Coefficient = as.numeric(as.matrix(coefficients))
) %>%
filter(Coefficient != 0) # Keep only non-zero coefficients
# Remove the intercept and rename variables
non_zero_coefficients <- non_zero_coefficients %>%
filter(Variable != "(Intercept)") %>% # Remove the intercept
mutate(
Variable = case_when( # Rename variables for cleaner labels
Variable == "health" ~ "Health",
Variable == "health_satisfaction" ~ "Health Satisfaction",
Variable == "life_satisfaction" ~ "Life Satisfaction",
Variable == "educ" ~ "Education Level",
Variable == "male" ~ "Gender (Male=1)",
Variable == "worr_job_dummy" ~ "Job Worries (Dummy)",
Variable == "tenure" ~ "Tenure",
Variable == "work_time_weekly" ~ "Work Time Weekly",
Variable == "net_income" ~ "Net Income",
Variable == "total_years_unemployment" ~ "Total Years Unemployed",
Variable == "worr_financial_dummy" ~ "Financial Worries (Dummy)",
Variable == "smoking_dummy" ~ "Smoking (Dummy)",
Variable == "worr_health_dummy" ~ "Worries About Health (Dummy)",
Variable == "BMI" ~ "BMI",
Variable == "age" ~ "Age",
TRUE ~ Variable # Keep any variable not explicitly renamed
)
)
# Plotting non-zero coefficients with cleaner labels
ggplot(non_zero_coefficients, aes(x = reorder(Variable, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity", color = "black") +
coord_flip() + # Flip coordinates for better readability
labs(
title = "Non-Zero Coefficients from Lasso Classification",
x = "",
y = "Coefficient Value"
) +
scale_fill_manual(
values = c("#FF0000", "#163E64"), # Red for negative, dark blue for positive
guide = "none" # Remove the legend
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank(), # Remove y-axis title
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis text
axis.text.y = element_text(size = 12, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis text
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space around the plot
)# Create time-lagged variables
panel_data <- panel_data %>%
group_by(pid) %>%
arrange(syear) %>%
mutate(
health_lag1 = lag(health, 1), # Previous year's health
net_income_lag1 = lag(net_income, 1), # Previous income
work_time_weekly_lag1 = lag(work_time_weekly, 1),
tenure_lag1 = lag(tenure, 1)
) %>%
ungroup()
# Define relevant numerical variables as a character vector
numerical_vars <- c(
"BMI", "smoking_dummy", "life_satisfaction", "health_satisfaction",
"tenure", "net_income", "work_time_weekly", "total_years_unemployment",
"worr_job_dummy", "worr_financial_dummy", "worr_economic_dummy",
"male", "age", "educ", "worr_health_dummy", "health","health_lag1","net_income_lag1",
"work_time_weekly_lag1","tenure_lag1"
)
# Create rf_data by removing specific columns
rf_data <- panel_data %>%
dplyr::select(-pid, -syear, -health_in_2yrs)
# Ensure health_decline_2yrs Is a Factor
rf_data$health_decline_2yrs <- factor(rf_data$health_decline_2yrs, levels = c(0, 1), labels = c("No", "Yes"))
# Dynamically create the formula
class_tree_formula <- as.formula(paste("health_decline_2yrs ~", paste(numerical_vars, collapse = " + ")))
# Fit the decision tree
class_tree <- rpart(
formula = class_tree_formula,
data = rf_data,
method = "class",
control = rpart.control(
cp = 0.001, # Lower complexity parameter to allow more splits
minsplit = 10, # Minimum number of observations required to split
minbucket = 5, # Minimum observations in a terminal node
maxdepth = 10 # Maximum depth of the tree
)
)
# Plot the decision tree
rpart.plot(class_tree, main = "Decision Tree: Classification")# Plot the decision tree with explicit splits
rpart.plot(class_tree, main = "Decision Tree: Classification", type = 5)# Building the classification model
set.seed(123) # For reproducibility
# Create a training and test dataset
trainIndex <- createDataPartition (rf_data$health_decline_2yrs, p = 0.7, list = FALSE)
trainData <- rf_data[trainIndex, ]
testData <- rf_data[-trainIndex, ]
dim(trainData)## [1] 52029 36
## [1] 22297 36
# Creates a data frame (tuning_grid) where trees ranges from 5 to 400 in
# steps of 5. The rmse column is initialized with NA to be filled later.
tuning_grid <- expand.grid(
trees = seq(5, 400, by = 5),
rmse = NA
)
for(i in seq_len(nrow(tuning_grid))) { # Iterates over each row of the tuning_grid.
# Fits a random forest model
fit <- ranger(
formula = class_tree_formula,
data = trainData,
num.trees = tuning_grid$trees[i],
mtry = sqrt(5),
verbose = FALSE,
seed = 123
)
# Stores the RMSE of the model
tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
}# Plot the RMSE against the number of trees with enhanced theme
ggplot(tuning_grid, aes(x = trees, y = rmse)) +
geom_line(color = "#163E64", size = 1) + # Dark blue line
geom_point(color = "#1E2B4F", size = 2, alpha = 0.8) + # Points for emphasis
labs(
title = "Random Forest Model Tuning: RMSE vs. Number of Trees",
x = "Number of Trees",
y = "RMSE"
) +
theme_minimal(base_size = 14) +
theme(
# Backgrounds
panel.background = element_rect(fill = "white", color = NA), # White background
plot.background = element_rect(fill = "white", color = NA), # White outer background
# Gridlines
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# Axis styling
axis.text.x = element_text(size = 12, color = "#1E2B4F"), # Dark blue x-axis text
axis.text.y = element_text(size = 12, color = "#1E2B4F"), # Dark blue y-axis text
axis.title.x = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue x-axis title
axis.title.y = element_text(size = 14, face = "bold", color = "#1E2B4F"), # Bold and dark blue y-axis title
# Title styling
plot.title = element_text(face = "bold", size = 16, hjust = 0.5, color = "#1E2B4F", margin = ggplot2::margin(b = 10)), # Centered and bold title
# Adjust overall plot margins
plot.margin = ggplot2::margin(t = 20, r = 100, b = 20, l = 20) # Add more space around the plot
)# Defines the training scheme to be 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)
tune_grid <- expand.grid(
mtry = seq(2,4,0.05), # Number of variables randomly sampled at each split
splitrule = "gini", # Number of trees in the forest
min.node.size = c(1, 5, 10) # Minimum size of terminal nodes
)
# Train the Random Forest model using ranger in caret
set.seed(123) # Set seed for reproducibility
rf_model <- train(
class_tree_formula,
data = trainData,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid,
num.trees = 100 # that we got previously
)
# Print the best model and results
print(rf_model)## Random Forest
##
## 52029 samples
## 20 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 46825, 46826, 46826, 46826, 46826, 46827, ...
## Resampling results across tuning parameters:
##
## mtry min.node.size Accuracy Kappa
## 2.00 1 0.7762596 0.2423502
## 2.00 5 0.7757791 0.2420059
## 2.00 10 0.7753755 0.2413648
## 2.05 1 0.7760481 0.2425404
## 2.05 5 0.7763941 0.2430463
## 2.05 10 0.7761635 0.2433259
## 2.10 1 0.7758174 0.2405821
## 2.10 5 0.7753562 0.2391151
## 2.10 10 0.7759136 0.2406673
## 2.15 1 0.7753178 0.2395225
## 2.15 5 0.7755292 0.2390607
## 2.15 10 0.7758367 0.2421017
## 2.20 1 0.7754715 0.2394104
## 2.20 5 0.7760481 0.2413263
## 2.20 10 0.7765670 0.2436645
## 2.25 1 0.7756637 0.2406185
## 2.25 5 0.7757406 0.2390820
## 2.25 10 0.7759520 0.2417052
## 2.30 1 0.7760481 0.2426873
## 2.30 5 0.7760097 0.2414553
## 2.30 10 0.7755484 0.2394323
## 2.35 1 0.7754331 0.2401254
## 2.35 5 0.7758752 0.2415318
## 2.35 10 0.7758175 0.2416242
## 2.40 1 0.7752986 0.2389376
## 2.40 5 0.7760866 0.2429995
## 2.40 10 0.7755484 0.2410445
## 2.45 1 0.7758559 0.2414142
## 2.45 5 0.7756829 0.2417891
## 2.45 10 0.7756060 0.2417614
## 2.50 1 0.7757022 0.2392650
## 2.50 5 0.7756061 0.2409608
## 2.50 10 0.7757022 0.2412297
## 2.55 1 0.7748949 0.2385566
## 2.55 5 0.7752601 0.2402137
## 2.55 10 0.7751640 0.2407818
## 2.60 1 0.7756253 0.2386815
## 2.60 5 0.7755291 0.2403709
## 2.60 10 0.7758943 0.2423947
## 2.65 1 0.7754715 0.2409088
## 2.65 5 0.7754331 0.2399590
## 2.65 10 0.7756830 0.2417928
## 2.70 1 0.7755099 0.2394047
## 2.70 5 0.7757406 0.2420819
## 2.70 10 0.7761634 0.2415054
## 2.75 1 0.7763749 0.2440967
## 2.75 5 0.7758366 0.2429109
## 2.75 10 0.7754907 0.2402735
## 2.80 1 0.7758559 0.2406823
## 2.80 5 0.7760289 0.2412477
## 2.80 10 0.7760866 0.2420997
## 2.85 1 0.7753946 0.2396741
## 2.85 5 0.7753754 0.2399322
## 2.85 10 0.7751063 0.2404767
## 2.90 1 0.7758752 0.2418803
## 2.90 5 0.7751833 0.2398975
## 2.90 10 0.7762596 0.2443438
## 2.95 1 0.7762211 0.2432634
## 2.95 5 0.7750872 0.2392301
## 2.95 10 0.7754908 0.2406305
## 3.00 1 0.7762979 0.2512952
## 3.00 5 0.7767785 0.2541414
## 3.00 10 0.7766824 0.2539323
## 3.05 1 0.7762788 0.2513614
## 3.05 5 0.7763364 0.2521690
## 3.05 10 0.7765671 0.2517214
## 3.10 1 0.7762788 0.2523829
## 3.10 5 0.7771821 0.2555619
## 3.10 10 0.7756636 0.2487239
## 3.15 1 0.7762211 0.2511605
## 3.15 5 0.7766824 0.2521123
## 3.15 10 0.7768747 0.2534072
## 3.20 1 0.7758558 0.2504063
## 3.20 5 0.7757982 0.2502967
## 3.20 10 0.7765286 0.2527068
## 3.25 1 0.7762980 0.2527437
## 3.25 5 0.7768553 0.2544584
## 3.25 10 0.7765863 0.2525456
## 3.30 1 0.7759136 0.2491722
## 3.30 5 0.7774319 0.2551287
## 3.30 10 0.7751063 0.2470186
## 3.35 1 0.7764325 0.2525247
## 3.35 5 0.7762979 0.2507474
## 3.35 10 0.7766055 0.2520172
## 3.40 1 0.7761058 0.2516604
## 3.40 5 0.7765287 0.2516809
## 3.40 10 0.7756638 0.2486000
## 3.45 1 0.7761250 0.2521445
## 3.45 5 0.7775665 0.2571149
## 3.45 10 0.7770284 0.2531448
## 3.50 1 0.7761634 0.2505725
## 3.50 5 0.7760289 0.2495523
## 3.50 10 0.7766440 0.2519198
## 3.55 1 0.7757598 0.2499307
## 3.55 5 0.7762597 0.2517382
## 3.55 10 0.7766439 0.2524883
## 3.60 1 0.7766440 0.2528759
## 3.60 5 0.7764902 0.2520897
## 3.60 10 0.7762404 0.2531191
## 3.65 1 0.7759137 0.2498898
## 3.65 5 0.7763941 0.2516598
## 3.65 10 0.7776434 0.2563267
## 3.70 1 0.7766054 0.2528090
## 3.70 5 0.7765670 0.2534486
## 3.70 10 0.7767207 0.2521568
## 3.75 1 0.7774127 0.2549828
## 3.75 5 0.7762211 0.2518634
## 3.75 10 0.7767784 0.2518003
## 3.80 1 0.7757598 0.2490717
## 3.80 5 0.7768361 0.2538232
## 3.80 10 0.7764133 0.2517616
## 3.85 1 0.7765286 0.2524457
## 3.85 5 0.7768746 0.2537274
## 3.85 10 0.7758945 0.2496873
## 3.90 1 0.7763942 0.2511008
## 3.90 5 0.7764517 0.2511859
## 3.90 10 0.7763941 0.2515130
## 3.95 1 0.7766055 0.2519689
## 3.95 5 0.7761059 0.2510110
## 3.95 10 0.7771822 0.2536184
## 4.00 1 0.7764517 0.2609711
## 4.00 5 0.7765285 0.2601853
## 4.00 10 0.7749333 0.2544787
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3.65, splitrule = gini
## and min.node.size = 10.
# Confusion matrix to evaluate performance
confusionMatrix(predictions, testData$health_decline_2yrs, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 15944 4157
## Yes 808 1388
##
## Accuracy : 0.7773
## 95% CI : (0.7718, 0.7828)
## No Information Rate : 0.7513
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2532
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.25032
## Specificity : 0.95177
## Pos Pred Value : 0.63206
## Neg Pred Value : 0.79319
## Prevalence : 0.24869
## Detection Rate : 0.06225
## Detection Prevalence : 0.09849
## Balanced Accuracy : 0.60104
##
## 'Positive' Class : Yes
##
# Fit the final model 'rf_model'
rf_final <- ranger(
formula = class_tree_formula,
data = trainData,
num.trees = 100, # Number of trees
probability = TRUE, # Predict probabilities
importance = "impurity", # Variable importance measure
mtry = 3.15, # Number of variables randomly sampled at each split
min.node.size = 10, # Minimum node size
seed = 123 # For reproducibility
)
# Get feature importance
v1 <- vip(rf_final) # Calculate and plot the variable importance
v1$data # Print the variable importance values# Plot of feature importance
v1_table <- v1$data
# Clean and rename variables
v1_table <- v1_table %>%
filter(Variable != "(Intercept)") %>% # Remove the intercept
mutate(
Variable = case_when( # Rename variables for cleaner labels
Variable == "health" ~ "Health",
Variable == "health_lag1" ~ "Health of previous year",
Variable == "health_satisfaction" ~ "Health Satisfaction",
Variable == "life_satisfaction" ~ "Life Satisfaction",
Variable == "educ" ~ "Education Level",
Variable == "male" ~ "Gender (Male=1)",
Variable == "worr_job_dummy" ~ "Job Worries (Dummy)",
Variable == "tenure" ~ "Tenure",
Variable == "tenure_lag1" ~ "Tenure of previous year",
Variable == "work_time_weekly" ~ "Weekly Work Hours",
Variable == "work_time_weekly_lag1" ~ "Weekly Work Hours of previous year",
Variable == "net_income" ~ "Net Income",
Variable == "net_income_lag1" ~ "Net Income of previous year",
Variable == "total_years_unemployment" ~ "Total Years Unemployed",
Variable == "worr_financial_dummy" ~ "Financial Worries (Dummy)",
Variable == "smoking_dummy" ~ "Smoking",
Variable == "worr_health_dummy" ~ "Health Worries (Dummy)",
Variable == "BMI" ~ "BMI",
Variable == "age" ~ "Age",
TRUE ~ Variable # Keep any variable not explicitly renamed
)
)
# Create the bar plot
ggplot(v1_table, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity", position = "dodge", fill = "#1E2B4F", color = "black") +
coord_flip() + # Flip coordinates for better readability
labs(
title = "Variable Importance from Random Forest",
x = "",
y = "Importance"
) +
theme_minimal(base_size = 14) +
theme(
panel.background = element_rect(fill = "white", color = NA), # Dark blue background
plot.background = element_rect(fill = "white", color = NA), # Dark blue outer background
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(size = 12, color = "#1E2B4F"),
axis.text.y = element_text(size = 12, face = "bold", color = "#1E2B4F"),
plot.title = element_text(face = "bold", size = 16, hjust = 0.25, color = "#1E2B4F"), # Shift title left and color
plot.caption = element_text(color = "#1E2B4F"),
axis.title = element_text(color = "#1E2B4F"),
legend.position = "none" # Remove the legend
)# Calculate the Partial Dependence for health_lag1
pdp_health_lag1 <- partial(
object = rf_final,
pred.var = "health_lag1",
train = trainData,
prob = TRUE
)
# Calculate the Partial Dependence for male
pdp_male <- partial(
object = rf_final,
pred.var = "male",
train = trainData,
prob = TRUE
)
# Calculate the Partial Dependence for bmi
pdp_bmi <- partial(
object = rf_final,
pred.var = "BMI",
train = trainData,
prob = TRUE
)
#Transform tenure_lag1 into numeric
trainData$tenure_lag1 <- as.numeric(trainData$tenure_lag1)
# Calculate the Partial Dependence for tenure
pdp_tenure <- partial(
object = rf_final,
pred.var = "tenure_lag1",
train = trainData,
prob = TRUE
)
#Transform life_satisfaction into numeric
trainData$life_satisfaction <- as.numeric(trainData$life_satisfaction)
# Calculate the Partial Dependence for Life Satisfaction
pdp_life <- partial(
object = rf_final,
pred.var = "life_satisfaction",
train = trainData,
prob = TRUE
)
# Calculate the Partial Dependence for age
pdp_age <- partial(
object = rf_final,
pred.var = "age",
train = trainData,
prob = TRUE
)# Plot the PDP for health_lag1
p1 <- autoplot(pdp_health_lag1) +
ylab("Health Decline Prob")+
xlab("Health in previous year")+
theme_minimal()
# Plot the PDP for male
p2 <- autoplot(pdp_male) +
ylab("Health Decline Prob")+
xlab("Male")+
theme_minimal()
# Plot the PDP for bmi
p3 <- autoplot(pdp_bmi) +
ylab("Health Decline Prob")+
xlab("BMI")+
theme_minimal()
# Plot the PDP for tenure
p4 <- autoplot(pdp_tenure) +
ylab("Health Decline Prob")+
xlab("Tenure of previous year")+
theme_minimal()
# Plot the PDP for life_satisfaction
p5 <- autoplot(pdp_life) +
ylab("Health Decline Prob")+
xlab("Life Satisfaction")+
theme_minimal()
# Plot the PDP for age
p6 <- autoplot(pdp_age) +
ylab("Health Decline Prob")+
xlab("Age")+
theme_minimal()
# Combine the plots
(p1 | p2 ) /
(p3 | p4 ) /
(p5 | p6 )